Skip to content

Commit

Permalink
kubeadm,rke2,k3s: always wait for all nodes to join when deploying mo…
Browse files Browse the repository at this point in the history
…re than one
  • Loading branch information
karmab committed Sep 7, 2024
1 parent 52a623d commit 0f25935
Show file tree
Hide file tree
Showing 6 changed files with 36 additions and 29 deletions.
8 changes: 8 additions & 0 deletions kvirt/cluster/k3s/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from ipaddress import ip_network
from kvirt.common import error, success, pprint, warning, get_kubectl, info2, container_mode, kube_create_app
from kvirt.common import deploy_cloud_storage, wait_cloud_dns, update_etc_hosts, fix_typos, get_cluster_api_vips
from kvirt.common import wait_for_nodes
import os
import re
from random import choice
Expand Down Expand Up @@ -103,6 +104,8 @@ def scale(config, plandir, cluster, overrides):
result = config.plan(plan, inputfile=f'{plandir}/{role}.yml', overrides=overrides, threaded=threaded)
if result['result'] != 'success':
return result
else:
pprint(f"{role.capitalize()} Nodes will join the cluster in the following minutes")
if cloud_native and provider == 'gcp':
pprint("Updating ip alias ranges")
update_ip_alias(config, ctlplanes + workers)
Expand Down Expand Up @@ -275,6 +278,11 @@ def create(config, plandir, cluster, overrides):
if f'{app}_version' not in overrides:
app_data[f'{app}_version'] = 'latest'
kube_create_app(config, app, appdir, overrides=app_data)
if ctlplanes + workers > 1:
ready = wait_for_nodes(ctlplanes + workers)
if not ready:
msg = "Timeout waiting for all nodes to join"
return {'result': 'failure', 'reason': msg}
if autoscale:
config.import_in_kube(network=network, secure=True)
with NamedTemporaryFile(mode='w+t') as temp:
Expand Down
23 changes: 5 additions & 18 deletions kvirt/cluster/kubeadm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import json
from kvirt.common import success, pprint, warning, info2, container_mode, wait_cloud_dns, update_etc_hosts, fix_typos
from kvirt.common import get_kubectl, kube_create_app, get_ssh_pub_key, _ssh_credentials, ssh, deploy_cloud_storage
from kvirt.common import get_cluster_api_vips
from kvirt.common import get_cluster_api_vips, wait_for_nodes
from kvirt.defaults import UBUNTUS
import os
from random import choice
Expand All @@ -12,7 +12,6 @@
from string import ascii_lowercase, ascii_letters, digits
from subprocess import call
from tempfile import NamedTemporaryFile
from time import sleep
from urllib.request import urlopen
from yaml import safe_dump, safe_load

Expand Down Expand Up @@ -89,6 +88,8 @@ def scale(config, plandir, cluster, overrides):
result = config.plan(plan, inputfile=f'{plandir}/{role}.yml', overrides=overrides, threaded=threaded)
if result['result'] != 'success':
return result
else:
pprint(f"{role.capitalize()} Nodes will join the cluster in the following minutes")
return {'result': 'success'}


Expand Down Expand Up @@ -309,25 +310,11 @@ def create(config, plandir, cluster, overrides):
if f'{app}_version' not in overrides:
app_data[f'{app}_version'] = 'latest'
kube_create_app(config, app, appdir, overrides=app_data)
if data['wait_ready']:
timeout = 600
counter = 0
ready = False
while True:
if len(os.popen("kubectl get node -o name").readlines()) == ctlplanes + workers:
ready = True
break
elif counter > timeout:
break
else:
pprint("Waiting 30s for all nodes to join")
sleep(30)
counter += 30
if ctlplanes + workers > 1:
ready = wait_for_nodes(ctlplanes + workers)
if not ready:
msg = "Timeout waiting for all nodes to join"
return {'result': 'failure', 'reason': msg}
else:
warning("Not waiting on all nodes to join the cluster")
if autoscale:
config.import_in_kube(network=network, secure=True)
with NamedTemporaryFile(mode='w+t') as temp:
Expand Down
1 change: 0 additions & 1 deletion kvirt/cluster/kubeadm/kcli_default.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ ctlplanes_threaded: false
workers_threaded: false
keys: []
tempkey: false
wait_ready: true
calico_version: None
autoscale: False
token:
Expand Down
17 changes: 8 additions & 9 deletions kvirt/cluster/rke2/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from ipaddress import ip_network
from kvirt.common import success, pprint, warning, info2, container_mode, wait_cloud_dns, update_etc_hosts, fix_typos
from kvirt.common import get_kubectl, get_ssh_pub_key, _ssh_credentials, ssh, deploy_cloud_storage
from kvirt.common import get_kubectl, get_ssh_pub_key, _ssh_credentials, ssh, deploy_cloud_storage, wait_for_nodes
from kvirt.defaults import UBUNTUS
import os
from random import choice
Expand All @@ -9,7 +9,6 @@
from string import ascii_lowercase, ascii_letters, digits
from subprocess import call
from tempfile import NamedTemporaryFile
from time import sleep
from yaml import safe_dump, safe_load

cloud_providers = ['aws', 'azure', 'gcp', 'ibm']
Expand Down Expand Up @@ -73,6 +72,8 @@ def scale(config, plandir, cluster, overrides):
result = config.plan(plan, inputfile=f'{plandir}/{role}.yml', overrides=overrides, threaded=threaded)
if result['result'] != 'success':
return result
else:
pprint(f"{role.capitalize()} Nodes will join the cluster in the following minutes")
return {'result': 'success'}


Expand Down Expand Up @@ -230,13 +231,11 @@ def create(config, plandir, cluster, overrides):
update_etc_hosts(cluster, domain, lb_ip)
break
os.environ['KUBECONFIG'] = f"{clusterdir}/auth/kubeconfig"
if data['wait_ready']:
pprint("Waiting for all nodes to join cluster")
while True:
if len(os.popen("kubectl get node -o name").readlines()) == ctlplanes + workers:
break
else:
sleep(10)
if ctlplanes + workers > 1:
ready = wait_for_nodes(ctlplanes + workers)
if not ready:
msg = "Timeout waiting for all nodes to join"
return {'result': 'failure', 'reason': msg}
if autoscale:
config.import_in_kube(network=network, secure=True)
with NamedTemporaryFile(mode='w+t') as temp:
Expand Down
1 change: 0 additions & 1 deletion kvirt/cluster/rke2/kcli_default.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ ctlplanes_threaded: false
workers_threaded: false
keys: []
tempkey: false
wait_ready: false
autoscale: False
async: false
cloud_dns: false
Expand Down
15 changes: 15 additions & 0 deletions kvirt/common/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2557,3 +2557,18 @@ def get_cluster_api_vips():
if automatic and api_ip is not None:
data[network] = 1 if network not in data else data[network] + 1
return data


def wait_for_nodes(number, timeout=480, interval=30):
    """
    Poll kubectl until the cluster reports exactly ``number`` nodes.

    The node count is taken as the number of lines printed by
    ``kubectl get node -o name``, run through the shell with the current
    environment.

    :param number: expected amount of nodes
    :param timeout: seconds of accumulated sleeping after which polling gives up
    :param interval: seconds slept between two polls (should be positive,
                     otherwise the timeout is never reached)
    :return: True as soon as the count matches, False once the timeout is exceeded
    """
    waited = 0
    while True:
        # Close the pipe explicitly on every poll instead of leaving it to the
        # garbage collector.
        with os.popen("kubectl get node -o name") as pipe:
            current = len(pipe.readlines())
        if current == number:
            return True
        # Strict comparison on purpose: polling only stops once the time slept
        # so far is greater than the timeout.
        if waited > timeout:
            return False
        pprint(f"Waiting {interval}s for all nodes to join")
        sleep(interval)
        waited += interval

0 comments on commit 0f25935

Please sign in to comment.