Fixed a lot during reinstall
commit b4ff3dfd32 (parent 88e3493880)
@@ -7,3 +7,5 @@
   roles:
-    - role: dci_finish
+    - role: dci_reinstall_talos
+  # Don't call 'dci_finish' here after 'dci_reinstall_talos'; it causes the server to reboot too early.
+  # 'dci_finish' is called from talos.yml
@@ -0,0 +1,12 @@
+# Hostbill Playbook
+---
+- name: Hostbill Machine Name
+  become: false
+  gather_facts: false
+  hosts:
+    - control_lb_nodes
+    - talos
+  roles:
+    - role: hostbill_machine_name
+      tags:
+        - hostbill_machine_name
@@ -6,27 +6,34 @@
   hosts:
     - talos_first_nodes
   roles:
-    - role: metallb
+    - role: k8s_metallb
       tags:
         - metallb
-    - role: traefik
+    - role: k8s_traefik
       tags:
         - traefik
-    - role: cert_manager
+    - role: k8s_cert_manager
       tags:
         - cert_manager
-    - role: rancher
+    - role: k8s_rancher
       tags:
         - rancher
-    - role: local_path_storage
+    - role: k8s_local_path_storage
       tags:
         - local_path_storage
-    - role: mayastor
+    - role: k8s_mayastor
       tags:
         - mayastor
-    - role: velero
+    - role: k8s_velero
       tags:
         - velero
+    - role: k8s_node_problem_detector
+      tags:
+        - k8s_node_problem_detector
+        - monitoring
+    - role: k8s_rancher_monitoring
+      tags:
+        - rancher_monitoring
 
 - name: Install per-node services on K8S
   become: false
@@ -34,7 +41,7 @@
   hosts:
     - talos
   roles:
-    - role: mayastor_diskpool
+    - role: k8s_mayastor_diskpool
       tags:
         - mayastor
         - mayastor_diskpool
os.yml
@@ -15,7 +15,7 @@
   hosts:
     - debian
   roles:
-    - role: debian
+    - role: os_debian
       tags:
         - debian
 
@@ -23,6 +23,6 @@
   hosts:
     - ubuntu
   roles:
-    - role: ubuntu
+    - role: os_ubuntu
       tags:
         - ubuntu
@@ -1,4 +1,9 @@
 ---
+- name: Include playbook hostbill
+  ansible.builtin.import_playbook: hostbill.yml
+  tags:
+    - hostbill
+
 - name: Include playbook talos
   ansible.builtin.import_playbook: talos.yml
   tags:
@@ -8,4 +8,5 @@
     type: A
     value: "{{ ansible_host }}"
     api_token: "{{ cloudflare_token }}"
     solo: true
+  register: record
@@ -1,7 +1,7 @@
 ---
 kubernetes_version: v1.26.9 # Rancher can't run on v1.27, see https://www.suse.com/suse-rancher/support-matrix/all-supported-versions/rancher-v2-7-6/
-talos_image_version: v1.5.2
-talos_version: v1.5.3
+talos_image_version: v1.4.7
+talos_version: v1.5.2
 ansible_root_dir: "{{ inventory_dir | ansible.builtin.dirname }}"
 ansible_vault_password_file: "{{ ansible_root_dir }}/.ansible/vault_pass"
 talos_generic_config_dir: "{{ ansible_root_dir }}/configs/talos"
@@ -14,3 +14,7 @@
   register: dci_finish
   until: dci_finish.status != 503
+  notify: Set fact dci_finish_called
   changed_when: true
+
+- name: Flush handlers
+  ansible.builtin.meta: flush_handlers
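The `notify` above points at a handler that is not part of this diff; together with the immediate `flush_handlers` it records that the 'Finish' call really happened, so the dci_reinstall_talos role below can test `dci_finish_called is defined`. A minimal sketch of what that handler would need to contain, assuming it lives in the dci_finish role's handlers/main.yml:

---
# Assumed handler content; not shown in this commit.
- name: Set fact dci_finish_called
  ansible.builtin.set_fact:
    dci_finish_called: true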
@@ -1,10 +1,14 @@
 ---
 
-- name: Sleep a bit because we just called the 'Finish' operation at DCI Manager
+# This role does not wipe the disks.
+# If there is still a Talos config present on disk, the server will not go into the Maintenance stage.
+# You can clear the disk using DCI Manager: https://dcimanager6.snel.com/auth/login
+
+- name: Sleep 2 minutes because we just called the 'Finish' operation at DCI Manager
   when: dci_finish_called is defined
   ansible.builtin.pause:
     prompt: Please wait
-    seconds: 30
+    seconds: 120
 
 - name: Call DCI Manager API to reinstall Talos Linux
   delegate_to: "{{ dci_manager_access_host }}"
@@ -23,6 +27,7 @@
     password: "_not_used_"
   register: _dci_reinstall
   until: _dci_reinstall.status != 503 and _dci_reinstall.status != -1
+  changed_when: true
 
 - name: Wait for Talos port 50000 to go down
   delegate_to: "{{ talosctl_host }}"
@@ -42,3 +47,11 @@
     host: "{{ ansible_host }}"
     port: 50000
     timeout: 1200
+
+- name: Import talos_machine_status tasks
+  ansible.builtin.import_tasks: "{{ role_path }}/../../shared/tasks/talos_machine_status.yml"
+
+- name: Verify Talos is in Maintenance stage
+  ansible.builtin.assert:
+    that: "talos_machine_status.spec.stage == 'maintenance'"
+    quiet: true
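shared/tasks/talos_machine_status.yml is imported here (and again in talos_config_apply further down) but its contents are not in this commit. A plausible sketch, inferred only from how `talos_machine_status.spec.stage` is consumed; the secure-then-insecure fallback and every name other than `talos_machine_status` are assumptions:

---
# Assumed content of shared/tasks/talos_machine_status.yml; not part of this commit.
- name: Get Talos machine status (secure endpoint)
  delegate_to: "{{ talosctl_host }}"
  become: false
  ansible.builtin.command:
    cmd: talosctl get machinestatus --output json --nodes '{{ ansible_host }}' --endpoints '{{ ansible_host }}'
  environment:
    TALOSCONFIG: "{{ talosconfig }}"
  changed_when: false
  failed_when: false
  register: _machine_status_secure

- name: Get Talos machine status (--insecure, while in maintenance mode)
  when: _machine_status_secure.rc != 0
  delegate_to: "{{ talosctl_host }}"
  become: false
  ansible.builtin.command:
    cmd: talosctl get machinestatus --output json --nodes '{{ ansible_host }}' --endpoints '{{ ansible_host }}' --insecure
  changed_when: false
  register: _machine_status_insecure

- name: Parse machine status
  ansible.builtin.set_fact:
    talos_machine_status: "{{ ((_machine_status_secure.rc == 0) | ternary(_machine_status_secure.stdout, _machine_status_insecure.stdout)) | from_json }}"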
@@ -0,0 +1,6 @@
+---
+hostbill_api_hostname: my.snel.com
+hostbill_api_ip: 193.33.60.161 # Normally behind Cloudflare; we need to use IPv4 for our API ID to work.
+hostbill_api_id: __SET_USING_GROUP_VARS__
+hostbill_api_key: __SET_USING_GROUP_VARS__
+hostbill_api_jumphost: jump.snel.com
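The __SET_USING_GROUP_VARS__ placeholders are meant to be overridden per inventory. A minimal sketch of such an override backed by Ansible Vault; the file path and the vault_-prefixed variable names are illustrative, not from this commit:

# group_vars/all/hostbill.yml (illustrative path)
hostbill_api_id: "{{ vault_hostbill_api_id }}"
hostbill_api_key: "{{ vault_hostbill_api_key }}"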
@@ -0,0 +1,41 @@
+---
+
+- name: Lookup Hostbill account details by IP
+  become: false
+  delegate_to: "{{ hostbill_api_jumphost }}"
+  ansible.builtin.uri:
+    url: "https://{{ hostbill_api_ip }}/admin/api.php?api_id={{ hostbill_api_id }}&api_key={{ hostbill_api_key }}&call=searchServiceByIP&ip={{ ansible_host }}"
+    headers:
+      Host: "{{ hostbill_api_hostname }}" # We have to do it this way to force using IPv4
+    validate_certs: false
+  register: _get_account_details
+
+- name: Verify we found exactly one Hostbill account
+  ansible.builtin.assert:
+    fail_msg: "We found {{ _get_account_details.json.services | length() }} hostbill accounts"
+    quiet: true
+    that:
+      - "1 == _get_account_details.json.services | length()"
+
+- name: Filter account details
+  ansible.builtin.set_fact:
+    _hostbill_account_details: "{{ _get_account_details.json.services[_get_account_details.json.services.keys() | first] }}"
+
+- name: Verify we found exactly one VPS
+  when: "_hostbill_account_details.source == 'proxmox'"
+  ansible.builtin.assert:
+    fail_msg: "We found {{ _get_account_details.json.services | length() }} VMs"
+    quiet: true
+    that:
+      - "1 == _hostbill_account_details.vms | length()"
+
+- name: Set hostname
+  when: "_hostbill_account_details.domain != inventory_hostname"
+  become: false
+  delegate_to: "{{ hostbill_api_jumphost }}"
+  ansible.builtin.uri:
+    url: "https://{{ hostbill_api_ip }}/admin/api.php?api_id={{ hostbill_api_id }}&api_key={{ hostbill_api_key }}&call=editAccountDetails&id={{ _hostbill_account_details.id }}&domain={{ inventory_hostname }}"
+    headers:
+      Host: "{{ hostbill_api_hostname }}" # We have to do it this way to force using IPv4
+    validate_certs: false
+  changed_when: true
@@ -29,6 +29,7 @@
     release_namespace: metallb-system
     create_namespace: false
     wait: true
+    timeout: "30m"
     # https://github.com/metallb/metallb/blob/main/charts/metallb/values.yaml
 
 - name: Address pool for MetalLB
@@ -0,0 +1,22 @@
+---
+# https://kubernetes.io/docs/tasks/debug/debug-cluster/monitor-node-health/
+
+- name: Helm add deliveryhero repo
+  delegate_to: "{{ kubectl_host }}"
+  run_once: true
+  kubernetes.core.helm_repository:
+    name: deliveryhero
+    repo_url: "https://charts.deliveryhero.io"
+
+- name: Helm deploy node-problem-detector
+  delegate_to: "{{ kubectl_host }}"
+  kubernetes.core.helm:
+    kubeconfig: "{{ kubeconfig }}"
+    chart_ref: deliveryhero/node-problem-detector
+    release_name: node-problem-detector
+    release_namespace: kube-system
+    create_namespace: false
+    wait: true
+    set_values:
+      # https://github.com/deliveryhero/helm-charts/blob/master/stable/node-problem-detector/values.yaml
+      # https://github.com/deliveryhero/helm-charts/tree/master/stable/node-problem-detector#values
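`set_values` is left empty here apart from the two reference links. If an override is ever needed, kubernetes.core.helm expects it as a list of value/value_type entries; a hedged example (the image.tag override and its version are purely illustrative, nothing this commit actually sets):

    set_values:
      - value: image.tag=v0.8.14 # illustrative override, not from this commit
        value_type: raw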
@@ -152,5 +152,5 @@
     url: "https://{{ rancher_hostname }}/v3/tokens?action=logout"
     method: POST
     headers:
-      Cookie: "R_SESS={{ _rancher_login.json.token }}"
+      Cookie: "R_SESS={{ _rancher_login.json.token | default(_rancher_pwchange_login.json.token) }}"
     status_code: [200]
@@ -0,0 +1,85 @@
+---
+
+- name: Helm add Rancher Monitoring repo
+  delegate_to: "{{ kubectl_host }}"
+  run_once: true
+  kubernetes.core.helm_repository:
+    name: rancher-monitoring
+    repo_url: "https://raw.githubusercontent.com/rancher/charts/release-v2.8"
+
+- name: Namespace
+  delegate_to: "{{ kubectl_host }}"
+  kubernetes.core.k8s:
+    kubeconfig: "{{ kubeconfig }}"
+    resource_definition:
+      apiVersion: v1
+      kind: Namespace
+      metadata:
+        name: cattle-monitoring-system
+        labels:
+          pod-security.kubernetes.io/audit: privileged
+          pod-security.kubernetes.io/enforce: privileged
+          pod-security.kubernetes.io/warn: privileged
+
+- name: Helm deploy Rancher Monitoring
+  delegate_to: "{{ kubectl_host }}"
+  kubernetes.core.helm:
+    kubeconfig: "{{ kubeconfig }}"
+    chart_ref: rancher-monitoring/rancher-monitoring
+    release_name: rancher-monitoring
+    release_namespace: cattle-monitoring-system
+    create_namespace: false
+    wait: true
+    # https://github.com/rancher/charts/blob/release-v2.8/charts/rancher-monitoring/102.0.1%2Bup40.1.2/values.yaml
+    values:
+      global:
+        cattle:
+          url: "https://{{ rancher_hostname }}"
+      grafana:
+        # https://github.com/grafana/helm-charts/blob/main/charts/grafana/values.yaml
+        grafana.ini:
+          server:
+            domain: "{{ rancher_hostname }}"
+            root_url: "https://{{ rancher_hostname }}/api/v1/namespaces/cattle-monitoring-system/services/http:rancher-monitoring-grafana:80/proxy/"
+        persistence:
+          accessModes:
+            - ReadWriteOnce
+          enabled: true
+          size: 1Gi
+          storageClassName: mayastor-2replicas
+          type: pvc
+        sidecar:
+          dashboards:
+            enabled: true
+            searchNamespace: ALL
+      kube-state-metrics:
+        metricLabelsAllowlist:
+          - pods=[*]
+          - deployments=[app.kubernetes.io/name,app.kubernetes.io/component,app.kubernetes.io/instance]
+      prometheus:
+        serviceMonitorSelectorNilUsesHelmValues: false
+        podMonitorSelectorNilUsesHelmValues: false
+        prometheusSpec:
+          externalUrl: "https://{{ rancher_hostname }}/api/v1/namespaces/cattle-monitoring-system/services/http:rancher-monitoring-prometheus:9090/proxy"
+          retentionSize: 10GiB
+          scrapeInterval: 60s
+          resources:
+            limits:
+              memory: 6000Mi
+              cpu: 2000m
+            requests:
+              memory: 3000Mi
+              cpu: 1500m
+          storageSpec:
+            volumeClaimTemplate:
+              spec:
+                accessModes:
+                  - ReadWriteOnce
+                resources:
+                  requests:
+                    storage: 20Gi
+                storageClassName: mayastor-2replicas
+      alertmanager:
+        alertmanagerSpec:
+          externalUrl: "https://{{ rancher_hostname }}/api/v1/namespaces/cattle-monitoring-system/services/http:rancher-monitoring-alertmanager:9093/proxy"
+          logLevel: debug
@@ -0,0 +1,3 @@
+---
+dependencies:
+  - role: common
@@ -0,0 +1,16 @@
+---
+- name: Wait for APT Lock
+  ansible.builtin.include_tasks: shared/tasks/wait_apt.yml
+
+- name: Uninstall chrony
+  ansible.builtin.apt:
+    name:
+      - chrony
+    state: absent
+    install_recommends: false
+
+- name: Install systemd-timesyncd
+  ansible.builtin.apt:
+    name:
+      - systemd-timesyncd
+    install_recommends: false
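These tasks swap chrony for systemd-timesyncd but stop at package installation. On Debian/Ubuntu the package usually enables the unit itself; if it does not, a follow-up along these lines would cover it (a sketch, not part of this commit):

- name: Enable and start systemd-timesyncd # illustrative follow-up, not in this commit
  ansible.builtin.systemd:
    name: systemd-timesyncd
    enabled: true
    state: started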
@@ -10,8 +10,27 @@
 - name: Import talos_machine_status tasks
   ansible.builtin.import_tasks: "{{ role_path }}/../../shared/tasks/talos_machine_status.yml"
 
+# 'mode=try' does not work when the machine is in Maintenance mode.
+- name: Dry run apply Talos node config
+  delegate_to: "{{ talosctl_host }}"
+  become: false
+  throttle: 1
+  ansible.builtin.command:
+    cmd: >-
+      talosctl apply-config
+      --dry-run
+      --file '{{ talos_node_config_file }}'
+      --nodes '{{ ansible_host }}'
+      --endpoints '{{ ansible_host }}'
+      {% if talos_machine_status.spec.stage == 'maintenance' %} --insecure{% endif %}
+  changed_when: false
+  environment:
+    TALOSCONFIG: "{{ talosconfig }}"
+  register: _talos_node_config_dry_run
+
 # 'mode=try' does not work when the machine is in Maintenance mode.
 - name: Apply Talos node config
+  when: '"No changes" not in _talos_node_config_dry_run.stderr'
   delegate_to: "{{ talosctl_host }}"
   become: false
   throttle: 1
@@ -7,7 +7,8 @@
       network:
         hostname: "{{ inventory_hostname }}"
         interfaces:
-          - interface: "{{ network_interface }}"
+          - deviceSelector:
+              busPath: "00:*"
             addresses:
               - "{{ ansible_host }}/{{ network_cidr_prefix }}"
             routes:
@@ -15,23 +16,33 @@
                 gateway: "{{ (ansible_host ~ '/' ~ network_cidr_prefix) | ansible.utils.ipaddr('1') | ansible.utils.ipaddr('address') }}"
       install:
         disk: "{{ talos_disk }}"
         image: "ghcr.io/siderolabs/installer:{{ talos_version }}"
+      kubelet:
+        extraArgs:
+          max-pods: "{% if 'talos_hardware_nodes' in group_names %}250{% else %}110{% endif %}"
 
 
 - name: Create Talos interfaces bond append
   when: "network_interface_bond is defined"
   ansible.utils.update_fact:
     updates:
       - path: "_talos_override_config.machine.network.interfaces.0.bond"
         value: "{{ network_interface_bond }}"
+      - path: "_talos_override_config.machine.network.interfaces.0.interface"
+        value: "bond0"
   changed_when: false
   register: _talos_override_update
 
-- name: Apply Talos interfaces bond append
+- name: Apply Talos interfaces bond append on override config
   when: "network_interface_bond is defined"
   ansible.builtin.set_fact:
     _talos_override_config: "{{ _talos_override_update._talos_override_config }}"
 
+- name: Remove deviceSelector
+  when: "network_interface_bond is defined"
+  ansible.builtin.set_fact:
+    _talos_override_config: "{{ _talos_override_config | ansible.utils.remove_keys(target=['deviceSelector']) }}"
+
 - name: Create temp directory
   delegate_to: "{{ talosctl_host }}"
   ansible.builtin.file:
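Taken together, the three bond tasks above rewrite the first interfaces entry from the deviceSelector form introduced in the previous hunk into an explicit bond0 entry. A before/after sketch of _talos_override_config.machine.network.interfaces when network_interface_bond is defined (addresses and routes elided):

# before update_fact / remove_keys (as templated above):
- deviceSelector:
    busPath: "00:*"
  addresses: [ ... ]

# after: deviceSelector stripped, interface and bond injected:
- interface: bond0
  bond: "{{ network_interface_bond }}"
  addresses: [ ... ]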
@@ -68,4 +79,4 @@
       --force
       --with-docs=false
       --with-examples=false
-  changed_when: true
+  changed_when: false
@@ -0,0 +1,3 @@
+---
+dependencies:
+  - role: common
@@ -12,8 +12,9 @@
     cmd: >-
       talosctl reset
       --graceful=false
-      --wait=true
+      --system-labels-to-wipe=STATE,EPHEMERAL
+      --reboot
+      --wait=true
       --endpoints '{{ ansible_host }}'
       --nodes '{{ ansible_host }}'
   changed_when: true
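Wiping the STATE and EPHEMERAL system labels removes the machine config and data, so after --reboot the node should come back up in maintenance mode; this is the counterpart to the dci_reinstall_talos warning that a leftover config on disk prevents reaching the Maintenance stage. A hypothetical verification step mirroring the one added in dci_reinstall_talos (not in this commit):

- name: Import talos_machine_status tasks # illustrative; mirrors dci_reinstall_talos
  ansible.builtin.import_tasks: "{{ role_path }}/../../shared/tasks/talos_machine_status.yml"

- name: Verify the node is back in Maintenance stage
  ansible.builtin.assert:
    that: "talos_machine_status.spec.stage == 'maintenance'"
    quiet: true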
@@ -6,3 +6,4 @@
   ansible.builtin.wait_for:
     host: "{{ ansible_host }}"
     port: 50000
+    timeout: 600
@@ -1,4 +0,0 @@
----
-
-# @TODO https://api2.hostbillapp.com/accounts/searchServiceByIP.html
-
talos.yml
@@ -1,4 +1,4 @@
-# Playbook Talos Install
+# Talos Install Playbook
 ---
 - name: Cloudflare DNS
   become: false
@@ -22,16 +22,6 @@
         - talos_wait_port
         - talos_config_apply
 
-- name: Talos hardware nodes
-  become: false
-  gather_facts: false
-  hosts:
-    - talos_hardware_nodes
-  roles:
-    - role: dci_finish
-      tags:
-        - dci_finish
-
 - name: Talos config
   become: false
   gather_facts: false
@@ -52,3 +42,13 @@
     - talos_first_nodes
   roles:
     - role: talos_bootstrap
+
+- name: Talos hardware nodes
+  become: false
+  gather_facts: false
+  hosts:
+    - talos_hardware_nodes
+  roles:
+    - role: dci_finish
+      tags:
+        - dci_finish
@@ -0,0 +1,8 @@
+---
+
+- name: Role talos_machine_status
+  gather_facts: false
+  hosts:
+    - talos
+  roles:
+    - role: talos_machine_status