QA-596: special hotbackup test case for cluster deployment (topology change) #533

Draft · wants to merge 25 commits into base: main
Commits (25):
178ed57 fix test.py (naushniki, Mar 26, 2025)
7e7533f Merge remote-tracking branch 'origin/main' into fixTestPy (naushniki, Mar 28, 2025)
f4b7e85 Merge remote-tracking branch 'origin/main' into fixTestPy (naushniki, Mar 28, 2025)
5e58481 WIP (naushniki, Mar 31, 2025)
5df203b WIP (naushniki, Mar 31, 2025)
505de52 lint (naushniki, Mar 31, 2025)
fd8d3ba fmt (naushniki, Mar 31, 2025)
28a72aa implement special test scenario for hotbackup on cluster deployment (naushniki, Mar 26, 2025)
c37c4da WIP (naushniki, Mar 31, 2025)
c9019d3 WIP (naushniki, Mar 31, 2025)
df73c7b lint (naushniki, Mar 31, 2025)
b337913 fmt (naushniki, Mar 31, 2025)
a1ce33a add special hotbackup testing scenario for cluster deployment (naushniki, Apr 1, 2025)
c47e159 WIP (naushniki, Apr 2, 2025)
b4a9b14 WIP (naushniki, Apr 2, 2025)
e0c69b8 Merge remote-tracking branch 'origin/main' into fixTestPy (naushniki, Apr 4, 2025)
88107b3 Merge remote-tracking branch 'origin/main' into fixTestPy (naushniki, Apr 9, 2025)
625b383 Merge remote-tracking branch 'origin/main' into fixTestPy (naushniki, Apr 9, 2025)
77c4fad Merge remote-tracking branch 'origin' into fixTestPy (naushniki, Apr 15, 2025)
d49cf8a WIP (naushniki, Apr 15, 2025)
21e3b44 Merge remote-tracking branch 'origin/fixTestPy' into feature/QA-596/c… (naushniki, Apr 15, 2025)
87b88f3 WIP (naushniki, Apr 16, 2025)
dd7a3f3 WIP (naushniki, Apr 16, 2025)
ffb0c86 Merge branch 'fixTestPy' into feature/QA-596/clusterTopologyChange (naushniki, Apr 24, 2025)
0090053 Merge remote-tracking branch 'origin/main' into feature/QA-596/cluste… (naushniki, Apr 24, 2025)
release_tester/arangodb/starter/deployments/cluster.py (105 changes: 92 additions, 13 deletions)
@@ -141,7 +141,8 @@ def add_starter(name, port, opts, sm, hasAgency):
self.create_tls_ca_cert()
port = 9528
count = 0
for this_node in list(range(1, self.props.cluster_nodes + 1)):
full_node_count = self.props.cluster_nodes + 2 # we need 2 additional nodes for hotbackup testing
for this_node in list(range(1, full_node_count + 1)):
node = []
node_opts.append(node)
if this_node != 1:
@@ -153,44 +154,43 @@ def add_starter(name, port, opts, sm, hasAgency):
add_starter(f"node{this_node}", port, node + common_opts, sm, count < 3)
port += 100
count += 1
self.backup_instance_count = count
for instance in self.starter_instances:
instance.is_leader = True

def starter_run_impl(self):
lh.subsection("instance setup")
for manager in self.starter_instances:
for manager in self.starter_instances[:self.props.cluster_nodes]:
logging.info("Spawning instance")
manager.run_starter()

logging.info("waiting for the starters to become alive")
not_started = self.starter_instances[:] # This is an explicit copy
not_running = self.get_running_starters() # This is an explicit copy
count = 0
while not_started:
logging.debug("waiting for mananger with logfile:" + str(not_started[-1].log_file))
if not_started[-1].is_instance_up():
not_started.pop()
while not_running:
logging.debug("waiting for mananger with logfile:" + str(not_running[-1].log_file))
if not_running[-1].is_instance_up():
not_running.pop()
progress(".")
time.sleep(1)
count += 1
if count > 120:
raise Exception("Cluster installation didn't come up in two minutes!")

logging.info("waiting for the cluster instances to become alive")
for node in self.starter_instances:
for node in self.get_running_starters():
node.detect_instances()
node.detect_instance_pids()
# self.cfg.add_frontend('http', self.cfg.publicip, str(node.get_frontend_port()))

logging.info("instances are ready - JWT: " + self.starter_instances[0].get_jwt_header())
count = 0
for node in self.starter_instances:
for node in self.get_running_starters():
node.set_passvoid("cluster", count == 0)
count += 1
self.passvoid = "cluster"

def finish_setup_impl(self):
self.makedata_instances = self.starter_instances[:]
self.makedata_instances = self.get_running_starters()
self.set_frontend_instances()

def _check_for_shards_in_sync(self):
@@ -483,12 +483,12 @@ def jam_attempt_impl(self):
# After attempt of jamming, we have peer for nodeX in setup.json.
# This peer will break further updates because this peer is unavailable.
# It is necessary to remove this peer from json for each starter instance
for instance in self.starter_instances:
for instance in self.get_running_starters():
remove_node_x_from_json(instance.basedir)

def shutdown_impl(self):
ret = False
for node in self.starter_instances:
for node in self.get_running_starters():
ret = ret or node.terminate_instance()
logging.info("test ended")
return ret
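
The hunks above swap direct iteration over `self.starter_instances` for `self.get_running_starters()`, and the new test in the next hunk picks spare nodes with `self.get_not_running_starters()`. Neither helper is shown in this diff; as a rough sketch only, they presumably live on the runner base class and split the provisioned starters into launched and spare ones. The `is_running` flag used here is a hypothetical attribute for illustration, not something confirmed by the diff:

```python
# Sketch only: assumed helpers on the runner base class, not shown in this diff.
# The real implementation may track liveness differently (e.g. via instance or
# PID detection); `is_running` is a hypothetical attribute used for illustration.

def get_running_starters(self):
    """Return the starter managers that have already been launched."""
    return [starter for starter in self.starter_instances if starter.is_running]

def get_not_running_starters(self):
    """Return the spare starter managers reserved for topology-change steps."""
    return [starter for starter in self.starter_instances if not starter.is_running]
```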
@@ -528,3 +528,82 @@ def generate_keyfile(self, keyfile):
"--host=localhost",
]
)

@step
def test_hotbackup_impl(self):
""" test hotbackup feature: Cluster """
with step("step 1: create a backup"):
self.create_backup_and_upload("thy_name_is_" + self.name)
backup_from_step_1 = self.uploaded_backups[-1]

with step("step 2: create non-backup data"):
self.create_non_backup_data()
self.tcp_ping_all_nodes()

with step("step 3: add new db server"):
new_starter = self.get_not_running_starters()[0]
self.run_starter_and_wait(new_starter)
self.backup_instance_count += 1

with step("step 4: create a backup"):
self.create_backup_and_upload("thy_name_is_" + self.name + "_plus1_server")
backup_from_step_4 = self.uploaded_backups[-1]

with step("step 5: remove old db server"):
if not self.starter_instances[1].have_this_instance(self.agency.get_leader()):
terminate_instance = self.starter_instances[1]
else:
terminate_instance = self.starter_instances[2]
terminated_dbserver_uuid = terminate_instance.get_dbserver().get_uuid()
terminate_instance.stop_dbserver()
self.remove_server_from_agency(terminated_dbserver_uuid)
self.backup_instance_count -= 1

with step("step 6: create another backup"):
self.create_backup_and_upload("thy_name_is_" + self.name + "_plus1_server_minus1_server")

with step("step 7: download and restore backup from step 1"):
self.download_backup(backup_from_step_1)
self.validate_local_backup(backup_from_step_1)
backups = self.list_backup()
if backups[-1] != backup_from_step_1:
raise Exception("downloaded backup has different name? " + str(backups))
self.restore_backup(backup_from_step_1)
self.tcp_ping_all_nodes()

with step("step 8: check data"):
self.check_data_impl()
if not self.check_non_backup_data():
raise Exception("data created after backup is still there??")

with step("step 9: add new db server"):
new_starter2 = self.get_not_running_starters()[0]
self.run_starter_and_wait(new_starter2)
self.backup_instance_count += 1

with step("step 10: download and restore backup from step 4"):
self.download_backup(backup_from_step_4)
self.validate_local_backup(backup_from_step_4)
backups = self.list_backup()
if backups[-1] != backup_from_step_4:
raise Exception("downloaded backup has different name? " + str(backups))
self.restore_backup(backup_from_step_4)
self.tcp_ping_all_nodes()

with step("step 11: check data"):
self.check_data_impl()

@staticmethod
def run_starter_and_wait(starter):
starter.run_starter()
count = 0
while not starter.is_instance_up():
logging.debug("waiting for mananger with logfile:" + str(starter.log_file))
progress(".")
time.sleep(1)
count += 1
if count > 120:
raise Exception("Starter manager installation didn't come up in two minutes!")
starter.detect_instances()
starter.detect_instance_pids()
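
The scenario's backup helpers (`create_backup_and_upload`, `list_backup`, `download_backup`, `restore_backup`) are defined elsewhere in the test framework. For orientation only, a minimal sketch of the hot backup endpoints they presumably wrap, assuming plain HTTP access to a coordinator with basic auth; payloads are simplified and the real helpers may go through arangobackup instead:

```python
import requests  # assumption: illustration only, not the framework's actual HTTP client


def create_hotbackup(coordinator_url, auth, label):
    """Sketch: POST /_admin/backup/create and return the new backup's id."""
    resp = requests.post(f"{coordinator_url}/_admin/backup/create", json={"label": label}, auth=auth)
    resp.raise_for_status()
    return resp.json()["result"]["id"]


def list_hotbackups(coordinator_url, auth):
    """Sketch: POST /_admin/backup/list and return the backup ids known to the cluster."""
    resp = requests.post(f"{coordinator_url}/_admin/backup/list", json={}, auth=auth)
    resp.raise_for_status()
    return list(resp.json()["result"]["list"].keys())


def restore_hotbackup(coordinator_url, auth, backup_id):
    """Sketch: POST /_admin/backup/restore to roll the cluster back to a backup."""
    resp = requests.post(f"{coordinator_url}/_admin/backup/restore", json={"id": backup_id}, auth=auth)
    resp.raise_for_status()
```

Restoring a backup that was taken with a different number of DB servers (steps 7 and 10) is exactly the topology-change case this test is meant to cover.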

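Step 5 also depends on `remove_server_from_agency()`, which this diff does not show. A hedged sketch of how such a helper could use the cluster administration endpoint; the helper name, coordinator URL, JWT handling, and retry policy below are assumptions:

```python
import json
import time

import requests  # assumption: illustration only


def remove_server_from_cluster(coordinator_url, jwt_token, server_uuid, timeout=120):
    """Sketch: ask a coordinator to drop a stopped DB server via /_admin/cluster/removeServer.

    The server must already be reported as failed, so the call is retried until
    the coordinator accepts it or the timeout expires.
    """
    headers = {"Authorization": f"bearer {jwt_token}"}
    deadline = time.time() + timeout
    while True:
        # Assumption: the endpoint takes the server ID as the JSON request body.
        resp = requests.post(
            f"{coordinator_url}/_admin/cluster/removeServer",
            data=json.dumps(server_uuid),
            headers=headers,
        )
        if resp.status_code in (200, 202):
            return
        if time.time() > deadline:
            raise Exception(f"could not remove server {server_uuid}: {resp.status_code} {resp.text}")
        time.sleep(2)
```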