Fixed a lot during reinstall

Jeroen Vermeulen 2023-10-16 20:22:34 +02:00
parent 88e3493880
commit b4ff3dfd32
63 changed files with 291 additions and 34 deletions

@@ -7,3 +7,5 @@
roles:
- role: dci_finish
- role: dci_reinstall_talos
# Don't call 'dci_finish' here after 'dci_reinstall_talos'; it would cause the server to reboot too early.
# 'dci_finish' is called from talos.yml

hostbill.yml Normal file
@@ -0,0 +1,12 @@
# Hostbill Playbook
---
- name: Hostbill Machine Name
become: false
gather_facts: false
hosts:
- control_lb_nodes
- talos
roles:
- role: hostbill_machine_name
tags:
- hostbill_machine_name

@@ -6,27 +6,34 @@
hosts:
- talos_first_nodes
roles:
- role: metallb
- role: k8s_metallb
tags:
- metallb
- role: traefik
- role: k8s_traefik
tags:
- traefik
- role: cert_manager
- role: k8s_cert_manager
tags:
- cert_manager
- role: rancher
- role: k8s_rancher
tags:
- rancher
- role: local_path_storage
- role: k8s_local_path_storage
tags:
- local_path_storage
- role: mayastor
- role: k8s_mayastor
tags:
- mayastor
- role: velero
- role: k8s_velero
tags:
- velero
- role: k8s_node_problem_detector
tags:
- k8s_node_problem_detector
- monitoring
- role: k8s_rancher_monitoring
tags:
- rancher_monitoring
- name: Install per-node services on K8S
become: false
@@ -34,7 +41,7 @@
hosts:
- talos
roles:
- role: mayastor_diskpool
- role: k8s_mayastor_diskpool
tags:
- mayastor
- mayastor_diskpool

os.yml
@@ -15,7 +15,7 @@
hosts:
- debian
roles:
- role: debian
- role: os_debian
tags:
- debian
@@ -23,6 +23,6 @@
hosts:
- ubuntu
roles:
- role: ubuntu
- role: os_ubuntu
tags:
- ubuntu

@@ -1,4 +1,9 @@
---
- name: Include playbook hostbill
ansible.builtin.import_playbook: hostbill.yml
tags:
- hostbill
- name: Include playbook talos
ansible.builtin.import_playbook: talos.yml
tags:

@@ -8,4 +8,5 @@
type: A
value: "{{ ansible_host }}"
api_token: "{{ cloudflare_token }}"
solo: true
register: record
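Adding solo: true makes community.general.cloudflare_dns delete any other records with the same name and type before creating this one, so repeated runs converge on a single A record per host. A minimal sketch of the full task in isolation (zone and record names are placeholders, not from this commit):

- name: Ensure exactly one A record for this host (sketch with placeholder zone/record)
  community.general.cloudflare_dns:
    zone: example.com                      # placeholder zone
    record: "{{ inventory_hostname }}"     # placeholder record name
    type: A
    value: "{{ ansible_host }}"
    solo: true                             # remove any other A records with this name first
    api_token: "{{ cloudflare_token }}"
  register: record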

@@ -1,7 +1,7 @@
---
kubernetes_version: v1.26.9 # Rancher can't run on v1.27, see https://www.suse.com/suse-rancher/support-matrix/all-supported-versions/rancher-v2-7-6/
talos_image_version: v1.5.2
talos_version: v1.5.3
talos_image_version: v1.4.7
talos_version: v1.5.2
ansible_root_dir: "{{ inventory_dir | ansible.builtin.dirname }}"
ansible_vault_password_file: "{{ ansible_root_dir }}/.ansible/vault_pass"
talos_generic_config_dir: "{{ ansible_root_dir }}/configs/talos"

@@ -14,3 +14,7 @@
register: dci_finish
until: dci_finish.status != 503
notify: Set fact dci_finish_called
changed_when: true
- name: Flush handlers
ansible.builtin.meta: flush_handlers
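The API call notifies a handler and the handlers are flushed immediately, so the fact is set before the next role checks it. The handler itself is not shown in this diff; a plausible sketch, assuming it only sets the fact named in the notify above:

# handlers/main.yml of dci_finish (hypothetical; only the handler name is known from the notify)
- name: Set fact dci_finish_called
  ansible.builtin.set_fact:
    dci_finish_called: true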

@@ -1,10 +1,14 @@
---
- name: Sleep a bit because we just called the 'Finish' operation at DCI Manager
# This role does not wipe the disks.
# If a Talos config is still present on disk, the server will not enter the Maintenance stage.
# You can clear the disk using DCI Manager: https://dcimanager6.snel.com/auth/login
- name: Sleep 2 minutes because we just called the 'Finish' operation at DCI Manager
when: dci_finish_called is defined
ansible.builtin.pause:
prompt: Please wait
seconds: 30
seconds: 120
- name: Call DCI Manager API to reinstall Talos Linux
delegate_to: "{{ dci_manager_access_host }}"
@@ -23,6 +27,7 @@
password: "_not_used_"
register: _dci_reinstall
until: _dci_reinstall.status != 503 and _dci_reinstall.status != -1
changed_when: true
- name: Wait for Talos port 50000 to go down
delegate_to: "{{ talosctl_host }}"
@@ -42,3 +47,11 @@
host: "{{ ansible_host }}"
port: 50000
timeout: 1200
- name: Import talos_machine_status tasks
ansible.builtin.import_tasks: "{{ role_path }}/../../shared/tasks/talos_machine_status.yml"
- name: Verify Talos is in Maintenance stage
ansible.builtin.assert:
that: "talos_machine_status.spec.stage == 'maintenance'"
quiet: true
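Both this role and talos_config_apply import shared/tasks/talos_machine_status.yml and then read talos_machine_status.spec.stage. That shared file is not part of this diff; a hedged sketch of what it could look like, assuming it wraps 'talosctl get machinestatus' (nodes still in maintenance mode may additionally need --insecure):

# shared/tasks/talos_machine_status.yml (hypothetical sketch; the real file is not in this commit)
- name: Read Talos MachineStatus resource
  delegate_to: "{{ talosctl_host }}"
  become: false
  ansible.builtin.command:
    cmd: >-
      talosctl get machinestatus -o yaml
      --nodes '{{ ansible_host }}'
      --endpoints '{{ ansible_host }}'
  environment:
    TALOSCONFIG: "{{ talosconfig }}"
  changed_when: false
  register: _talos_machine_status_raw
- name: Expose the parsed resource as talos_machine_status
  ansible.builtin.set_fact:
    talos_machine_status: "{{ _talos_machine_status_raw.stdout | from_yaml }}"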

@@ -0,0 +1,6 @@
---
hostbill_api_hostname: my.snel.com
hostbill_api_ip: 193.33.60.161 # Normally behind Cloudflare; we need to connect to the IPv4 address directly for our API ID to work.
hostbill_api_id: __SET_USING_GROUP_VARS__
hostbill_api_key: __SET_USING_GROUP_VARS__
hostbill_api_jumphost: jump.snel.com

@@ -0,0 +1,41 @@
---
- name: Lookup Hostbill account details by IP
become: false
delegate_to: "{{ hostbill_api_jumphost }}"
ansible.builtin.uri:
url: "https://{{ hostbill_api_ip }}/admin/api.php?api_id={{ hostbill_api_id }}&api_key={{ hostbill_api_key }}&call=searchServiceByIP&ip={{ ansible_host }}"
headers:
Host: "{{ hostbill_api_hostname }}" # We have to do it this way to force using IPv4
validate_certs: false
register: _get_account_details
- name: Verify we found exactly one Hostbill account
ansible.builtin.assert:
fail_msg: "We found {{ _get_account_details.json.services | length() }} hostbill accounts"
quiet: true
that:
- "1 == _get_account_details.json.services | length()"
- name: Filter account details
ansible.builtin.set_fact:
_hostbill_account_details: "{{ _get_account_details.json.services[_get_account_details.json.services.keys() | first] }}"
- name: Verify we found exactly one VPS
when: "_hostbill_account_details.source == 'proxmox'"
ansible.builtin.assert:
fail_msg: "We found {{ _get_account_details.json.services | length() }} VMs"
quiet: true
that:
- "1 == _hostbill_account_details.vms | length()"
- name: Set hostname
when: "_hostbill_account_details.domain != inventory_hostname"
become: false
delegate_to: "{{ hostbill_api_jumphost }}"
ansible.builtin.uri:
url: "https://{{ hostbill_api_ip }}/admin/api.php?api_id={{ hostbill_api_id }}&api_key={{ hostbill_api_key }}&call=editAccountDetails&id={{ _hostbill_account_details.id }}&domain={{ inventory_hostname }}"
headers:
Host: "{{ hostbill_api_hostname }}" # We have to do it this way to force using IPv4
validate_certs: false
changed_when: true
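The assertions and set_fact above assume searchServiceByIP returns the matching services keyed by account ID. An illustrative response shape (hypothetical values, trimmed to the fields this role actually reads):

# Hypothetical response body, inferred from the fields used above; not actual HostBill output
services:
  "12345":
    id: "12345"
    domain: node01.example.com   # compared against inventory_hostname before renaming
    source: dci                  # a value of 'proxmox' triggers the extra single-VM check

The set_fact picks the first (and only) key, and the final uri call renames the HostBill account's domain to inventory_hostname when the two differ.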

@@ -29,6 +29,7 @@
release_namespace: metallb-system
create_namespace: false
wait: true
timeout: "30m"
# https://github.com/metallb/metallb/blob/main/charts/metallb/values.yaml
- name: Address pool for MetalLB

@@ -0,0 +1,22 @@
---
# https://kubernetes.io/docs/tasks/debug/debug-cluster/monitor-node-health/
- name: Helm add deliveryhero repo
delegate_to: "{{ kubectl_host }}"
run_once: true
kubernetes.core.helm_repository:
name: deliveryhero
repo_url: "https://charts.deliveryhero.io"
- name: Helm deploy node-problem-detector
delegate_to: "{{ kubectl_host }}"
kubernetes.core.helm:
kubeconfig: "{{ kubeconfig }}"
chart_ref: deliveryhero/node-problem-detector
release_name: node-problem-detector
release_namespace: kube-system
create_namespace: false
wait: true
set_values:
# https://github.com/deliveryhero/helm-charts/blob/master/stable/node-problem-detector/values.yaml
# https://github.com/deliveryhero/helm-charts/tree/master/stable/node-problem-detector#values
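The set_values list itself is cut off at the end of this hunk. For reference, kubernetes.core.helm expects it as a list of value / value_type entries; a generic illustration of the format (placeholder key, not the values used in this commit):

set_values:
  - value: "some.chart.key=some-value"   # placeholder; see the chart's values.yaml links above
    value_type: string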

@@ -152,5 +152,5 @@
url: "https://{{ rancher_hostname }}/v3/tokens?action=logout"
method: POST
headers:
Cookie: "R_SESS={{ _rancher_login.json.token }}"
Cookie: "R_SESS={{ _rancher_login.json.token | default(_rancher_pwchange_login.json.token) }}"
status_code: [200]
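With the default() fallback, logout works for both paths: the token registered by the normal login task is used when present, otherwise the one registered during the initial password-change login. The same pattern in isolation:

- name: Show which session token the logout call would use (illustration only)
  ansible.builtin.debug:
    msg: "{{ _rancher_login.json.token | default(_rancher_pwchange_login.json.token) }}"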

@@ -0,0 +1,85 @@
---
- name: Helm add Rancher Monitoring repo
delegate_to: "{{ kubectl_host }}"
run_once: true
kubernetes.core.helm_repository:
name: rancher-monitoring
repo_url: "https://raw.githubusercontent.com/rancher/charts/release-v2.8"
- name: Namespace
delegate_to: "{{ kubectl_host }}"
kubernetes.core.k8s:
kubeconfig: "{{ kubeconfig }}"
resource_definition:
apiVersion: v1
kind: Namespace
metadata:
name: cattle-monitoring-system
labels:
pod-security.kubernetes.io/audit: privileged
pod-security.kubernetes.io/enforce: privileged
pod-security.kubernetes.io/warn: privileged
- name: Helm deploy Rancher Monitoring
delegate_to: "{{ kubectl_host }}"
kubernetes.core.helm:
kubeconfig: "{{ kubeconfig }}"
chart_ref: rancher-monitoring/rancher-monitoring
release_name: rancher-monitoring
release_namespace: cattle-monitoring-system
create_namespace: false
wait: true
# https://github.com/rancher/charts/blob/release-v2.8/charts/rancher-monitoring/102.0.1%2Bup40.1.2/values.yaml
values:
global:
cattle:
url: "https://{{ rancher_hostname }}"
grafana:
# https://github.com/grafana/helm-charts/blob/main/charts/grafana/values.yaml
grafana.ini:
server:
domain: "{{ rancher_hostname }}"
root_url: "https://{{ rancher_hostname }}/api/v1/namespaces/cattle-monitoring-system/services/http:rancher-monitoring-grafana:80/proxy/"
persistence:
accessModes:
- ReadWriteOnce
enabled: true
size: 1Gi
storageClassName: mayastor-2replicas
type: pvc
sidecar:
dashboards:
enabled: true
searchNamespace: ALL
kube-state-metrics:
metricLabelsAllowlist:
- pods=[*]
- deployments=[app.kubernetes.io/name,app.kubernetes.io/component,app.kubernetes.io/instance]
prometheus:
serviceMonitorSelectorNilUsesHelmValues: false
podMonitorSelectorNilUsesHelmValues: false
prometheusSpec:
externalUrl: "https://{{ rancher_hostname }}/api/v1/namespaces/cattle-monitoring-system/services/http:rancher-monitoring-prometheus:9090/proxy"
retentionSize: 10GiB
scrapeInterval: 60s
resources:
limits:
memory: 6000Mi
cpu: 2000m
requests:
memory: 3000Mi
cpu: 1500m
storageSpec:
volumeClaimTemplate:
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 20Gi
storageClassName: mayastor-2replicas
alertmanager:
alertmanagerSpec:
externalUrl: "https://{{ rancher_hostname }}/api/v1/namespaces/cattle-monitoring-system/services/http:rancher-monitoring-alertmanager:9093/proxy"
logLevel: debug
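When rancher-monitoring is installed straight from the charts repo rather than through the Rancher UI, its companion CRD chart normally has to be installed first. That step is not visible in this diff; a hedged sketch of what it could look like:

# Hypothetical companion task; rancher-monitoring-crd is a separate chart in the same repo
- name: Helm deploy Rancher Monitoring CRDs
  delegate_to: "{{ kubectl_host }}"
  kubernetes.core.helm:
    kubeconfig: "{{ kubeconfig }}"
    chart_ref: rancher-monitoring/rancher-monitoring-crd
    release_name: rancher-monitoring-crd
    release_namespace: cattle-monitoring-system
    create_namespace: false
    wait: true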

@@ -0,0 +1,3 @@
---
dependencies:
- role: common

@@ -0,0 +1,16 @@
---
- name: Wait for APT Lock
ansible.builtin.include_tasks: shared/tasks/wait_apt.yml
- name: Uninstall chrony
ansible.builtin.apt:
name:
- chrony
state: absent
install_recommends: false
- name: Install systemd-timesyncd
ansible.builtin.apt:
name:
- systemd-timesyncd
install_recommends: false
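Installing the package does not by itself guarantee the service is active if chrony had stopped or masked it earlier; making the service state explicit would let the role converge from any starting point. A possible follow-up task (not part of this commit):

# Hypothetical follow-up, not shown in this diff
- name: Enable and start systemd-timesyncd
  ansible.builtin.systemd:
    name: systemd-timesyncd
    enabled: true
    state: started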

@@ -10,8 +10,27 @@
- name: Import talos_machine_status tasks
ansible.builtin.import_tasks: "{{ role_path }}/../../shared/tasks/talos_machine_status.yml"
# 'mode=try' does not work when the machine is in Maintenance mode.
- name: Dry run apply Talos node config
delegate_to: "{{ talosctl_host }}"
become: false
throttle: 1
ansible.builtin.command:
cmd: >-
talosctl apply-config
--dry-run
--file '{{ talos_node_config_file }}'
--nodes '{{ ansible_host }}'
--endpoints '{{ ansible_host }}'
{% if talos_machine_status.spec.stage == 'maintenance' %} --insecure{% endif %}
changed_when: false
environment:
TALOSCONFIG: "{{ talosconfig }}"
register: _talos_node_config_dry_run
# 'mode=try' does not work when the machine is in Maintenance mode.
- name: Apply Talos node config
when: '"No changes" not in _talos_node_config_dry_run.stderr'
delegate_to: "{{ talosctl_host }}"
become: false
throttle: 1
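The dry run registers its output and the real apply is skipped when talosctl reports "No changes" on stderr. The body of the apply task falls outside this hunk; presumably it runs the same command without --dry-run, roughly as follows (assumption, not shown in this commit):

# Continuation sketch (assumption; the remainder of the task is outside this hunk)
  ansible.builtin.command:
    cmd: >-
      talosctl apply-config
      --file '{{ talos_node_config_file }}'
      --nodes '{{ ansible_host }}'
      --endpoints '{{ ansible_host }}'
      {% if talos_machine_status.spec.stage == 'maintenance' %} --insecure{% endif %}
  changed_when: true
  environment:
    TALOSCONFIG: "{{ talosconfig }}"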

@@ -7,7 +7,8 @@
network:
hostname: "{{ inventory_hostname }}"
interfaces:
- interface: "{{ network_interface }}"
- deviceSelector:
busPath: "00:*"
addresses:
- "{{ ansible_host }}/{{ network_cidr_prefix }}"
routes:
@@ -15,23 +16,33 @@
gateway: "{{ (ansible_host ~ '/' ~ network_cidr_prefix) | ansible.utils.ipaddr('1') | ansible.utils.ipaddr('address') }}"
install:
disk: "{{ talos_disk }}"
image: "ghcr.io/siderolabs/installer:{{ talos_version }}"
kubelet:
extraArgs:
max-pods: "{% if 'talos_hardware_nodes' in group_names %}250{% else %}110{% endif %}"
- name: Create Talos interfaces bond append
when: "network_interface_bond is defined"
ansible.utils.update_fact:
updates:
- path: "_talos_override_config.machine.network.interfaces.0.bond"
value: "{{ network_interface_bond }}"
- path: "_talos_override_config.machine.network.interfaces.0.interface"
value: "bond0"
changed_when: false
register: _talos_override_update
- name: Apply Talos interfaces bond append
- name: Apply Talos interfaces bond append on override config
when: "network_interface_bond is defined"
ansible.builtin.set_fact:
_talos_override_config: "{{ _talos_override_update._talos_override_config }}"
- name: Remove deviceSelector
when: "network_interface_bond is defined"
ansible.builtin.set_fact:
_talos_override_config: "{{ _talos_override_config | ansible.utils.remove_keys(target=['deviceSelector']) }}"
- name: Create temp directory
delegate_to: "{{ talosctl_host }}"
ansible.builtin.file:
@@ -68,4 +79,4 @@
--force
--with-docs=false
--with-examples=false
changed_when: true
changed_when: false

@@ -0,0 +1,3 @@
---
dependencies:
- role: common

@@ -12,8 +12,9 @@
cmd: >-
talosctl reset
--graceful=false
--wait=true
--system-labels-to-wipe=STATE,EPHEMERAL
--reboot
--wait=true
--endpoints '{{ ansible_host }}'
--nodes '{{ ansible_host }}'
changed_when: true

@@ -6,3 +6,4 @@
ansible.builtin.wait_for:
host: "{{ ansible_host }}"
port: 50000
timeout: 600

@@ -1,4 +0,0 @@
---
# @TODO https://api2.hostbillapp.com/accounts/searchServiceByIP.html

@@ -1,4 +1,4 @@
# Playbook Talos Install
# Talos Install Playbook
---
- name: Cloudflare DNS
become: false
@@ -22,16 +22,6 @@
- talos_wait_port
- talos_config_apply
- name: Talos hardware nodes
become: false
gather_facts: false
hosts:
- talos_hardware_nodes
roles:
- role: dci_finish
tags:
- dci_finish
- name: Talos config
become: false
gather_facts: false
@@ -52,3 +42,13 @@
- talos_first_nodes
roles:
- role: talos_bootstrap
- name: Talos hardware nodes
become: false
gather_facts: false
hosts:
- talos_hardware_nodes
roles:
- role: dci_finish
tags:
- dci_finish

talos_machine_status.yml Normal file
@@ -0,0 +1,8 @@
---
- name: Role talos_machine_status
gather_facts: false
hosts:
- talos
roles:
- role: talos_machine_status