Fixed a lot during reinstall

Jeroen Vermeulen 2023-10-16 20:22:34 +02:00
parent 88e3493880
commit b4ff3dfd32
63 changed files with 291 additions and 34 deletions

@@ -7,3 +7,5 @@
roles:
- role: dci_finish
- role: dci_reinstall_talos
# Don't call 'dci_finish' here after 'dci_reinstall_talos'; it would cause the server to reboot too early.
# 'dci_finish' is called from talos.yml

hostbill.yml Normal file
@@ -0,0 +1,12 @@
# Hostbill Playbook
---
- name: Hostbill Machine Name
become: false
gather_facts: false
hosts:
- control_lb_nodes
- talos
roles:
- role: hostbill_machine_name
tags:
- hostbill_machine_name

@@ -6,27 +6,34 @@
hosts:
- talos_first_nodes
roles:
- role: metallb
- role: k8s_metallb
tags:
- metallb
- role: traefik
- role: k8s_traefik
tags:
- traefik
- role: cert_manager
- role: k8s_cert_manager
tags:
- cert_manager
- role: rancher
- role: k8s_rancher
tags:
- rancher
- role: local_path_storage
- role: k8s_local_path_storage
tags:
- local_path_storage
- role: mayastor
- role: k8s_mayastor
tags:
- mayastor
- role: velero
- role: k8s_velero
tags:
- velero
- role: k8s_node_problem_detector
tags:
- k8s_node_problem_detector
- monitoring
- role: k8s_rancher_monitoring
tags:
- rancher_monitoring
- name: Install per-node services on K8S
become: false
@@ -34,7 +41,7 @@
hosts:
- talos
roles:
- role: mayastor_diskpool
- role: k8s_mayastor_diskpool
tags:
- mayastor
- mayastor_diskpool

os.yml
@@ -15,7 +15,7 @@
hosts:
- debian
roles:
- role: debian
- role: os_debian
tags:
- debian
@@ -23,6 +23,6 @@
hosts:
- ubuntu
roles:
- role: ubuntu
- role: os_ubuntu
tags:
- ubuntu

@@ -1,4 +1,9 @@
---
- name: Include playbook hostbill
ansible.builtin.import_playbook: hostbill.yml
tags:
- hostbill
- name: Include playbook talos
ansible.builtin.import_playbook: talos.yml
tags:

@@ -8,4 +8,5 @@
type: A
value: "{{ ansible_host }}"
api_token: "{{ cloudflare_token }}"
solo: true
register: record
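Adding solo: true makes community.general.cloudflare_dns delete any other records with the same name and type before creating this one, so repeated runs converge on a single A record per host. A minimal sketch of the full task in isolation (zone and record names are placeholders, not from this commit):

- name: Ensure exactly one A record for this host (sketch with placeholder zone/record)
  community.general.cloudflare_dns:
    zone: example.com                      # placeholder zone
    record: "{{ inventory_hostname }}"     # placeholder record name
    type: A
    value: "{{ ansible_host }}"
    solo: true                             # remove any other A records with this name first
    api_token: "{{ cloudflare_token }}"
  register: record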

@@ -1,7 +1,7 @@
---
kubernetes_version: v1.26.9 # Rancher can't run on v1.27, see https://www.suse.com/suse-rancher/support-matrix/all-supported-versions/rancher-v2-7-6/
talos_image_version: v1.5.2
talos_version: v1.5.3
talos_image_version: v1.4.7
talos_version: v1.5.2
ansible_root_dir: "{{ inventory_dir | ansible.builtin.dirname }}"
ansible_vault_password_file: "{{ ansible_root_dir }}/.ansible/vault_pass"
talos_generic_config_dir: "{{ ansible_root_dir }}/configs/talos"

@@ -14,3 +14,7 @@
register: dci_finish
until: dci_finish.status != 503
notify: Set fact dci_finish_called
changed_when: true
- name: Flush handlers
ansible.builtin.meta: flush_handlers
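The API call notifies a handler and the handlers are flushed immediately, so the fact is set before the next role checks it. The handler itself is not shown in this diff; a plausible sketch, assuming it only sets the fact named in the notify above:

# handlers/main.yml of dci_finish (hypothetical; only the handler name is known from the notify)
- name: Set fact dci_finish_called
  ansible.builtin.set_fact:
    dci_finish_called: true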

@@ -1,10 +1,14 @@
---
- name: Sleep a bit because we just called the 'Finish' operation at DCI Manager
# This role does not wipe the disks.
# If a Talos config is still present on disk, the server will not enter the Maintenance stage.
# You can clear the disk using DCI Manager: https://dcimanager6.snel.com/auth/login
- name: Sleep 2 minutes because we just called the 'Finish' operation at DCI Manager
when: dci_finish_called is defined
ansible.builtin.pause:
prompt: Please wait
seconds: 30
seconds: 120
- name: Call DCI Manager API to reinstall Talos Linux
delegate_to: "{{ dci_manager_access_host }}"
@@ -23,6 +27,7 @@
password: "_not_used_"
register: _dci_reinstall
until: _dci_reinstall.status != 503 and _dci_reinstall.status != -1
changed_when: true
- name: Wait for Talos port 50000 to go down
delegate_to: "{{ talosctl_host }}"
@@ -42,3 +47,11 @@
host: "{{ ansible_host }}"
port: 50000
timeout: 1200
- name: Import talos_machine_status tasks
ansible.builtin.import_tasks: "{{ role_path }}/../../shared/tasks/talos_machine_status.yml"
- name: Verify Talos is in Maintenance stage
ansible.builtin.assert:
that: "talos_machine_status.spec.stage == 'maintenance'"
quiet: true
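Both this role and talos_config_apply import shared/tasks/talos_machine_status.yml and then read talos_machine_status.spec.stage. That shared file is not part of this diff; a hedged sketch of what it could look like, assuming it wraps 'talosctl get machinestatus' (nodes still in maintenance mode may additionally need --insecure):

# shared/tasks/talos_machine_status.yml (hypothetical sketch; the real file is not in this commit)
- name: Read Talos MachineStatus resource
  delegate_to: "{{ talosctl_host }}"
  become: false
  ansible.builtin.command:
    cmd: >-
      talosctl get machinestatus -o yaml
      --nodes '{{ ansible_host }}'
      --endpoints '{{ ansible_host }}'
  environment:
    TALOSCONFIG: "{{ talosconfig }}"
  changed_when: false
  register: _talos_machine_status_raw
- name: Expose the parsed resource as talos_machine_status
  ansible.builtin.set_fact:
    talos_machine_status: "{{ _talos_machine_status_raw.stdout | from_yaml }}"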

@@ -0,0 +1,6 @@
---
hostbill_api_hostname: my.snel.com
hostbill_api_ip: 193.33.60.161 # Normally behind Cloudflare; we need to connect to the IPv4 address directly for our API ID to work.
hostbill_api_id: __SET_USING_GROUP_VARS__
hostbill_api_key: __SET_USING_GROUP_VARS__
hostbill_api_jumphost: jump.snel.com

@@ -0,0 +1,41 @@
---
- name: Lookup Hostbill account details by IP
become: false
delegate_to: "{{ hostbill_api_jumphost }}"
ansible.builtin.uri:
url: "https://{{ hostbill_api_ip }}/admin/api.php?api_id={{ hostbill_api_id }}&api_key={{ hostbill_api_key }}&call=searchServiceByIP&ip={{ ansible_host }}"
headers:
Host: "{{ hostbill_api_hostname }}" # We have to do it this way to force using IPv4
validate_certs: false
register: _get_account_details
- name: Verify we found exactly one Hostbill account
ansible.builtin.assert:
fail_msg: "We found {{ _get_account_details.json.services | length() }} hostbill accounts"
quiet: true
that:
- "1 == _get_account_details.json.services | length()"
- name: Filter account details
ansible.builtin.set_fact:
_hostbill_account_details: "{{ _get_account_details.json.services[_get_account_details.json.services.keys() | first] }}"
- name: Verify we found exactly one VPS
when: "_hostbill_account_details.source == 'proxmox'"
ansible.builtin.assert:
fail_msg: "We found {{ _get_account_details.json.services | length() }} VMs"
quiet: true
that:
- "1 == _hostbill_account_details.vms | length()"
- name: Set hostname
when: "_hostbill_account_details.domain != inventory_hostname"
become: false
delegate_to: "{{ hostbill_api_jumphost }}"
ansible.builtin.uri:
url: "https://{{ hostbill_api_ip }}/admin/api.php?api_id={{ hostbill_api_id }}&api_key={{ hostbill_api_key }}&call=editAccountDetails&id={{ _hostbill_account_details.id }}&domain={{ inventory_hostname }}"
headers:
Host: "{{ hostbill_api_hostname }}" # We have to do it this way to force using IPv4
validate_certs: false
changed_when: true
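The assertions and set_fact above assume searchServiceByIP returns the matching services keyed by account ID. An illustrative response shape (hypothetical values, trimmed to the fields this role actually reads):

# Hypothetical response body, inferred from the fields used above; not actual HostBill output
services:
  "12345":
    id: "12345"
    domain: node01.example.com   # compared against inventory_hostname before renaming
    source: dci                  # a value of 'proxmox' triggers the extra single-VM check

The set_fact picks the first (and only) key, and the final uri call renames the HostBill account's domain to inventory_hostname when the two differ.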

@@ -29,6 +29,7 @@
release_namespace: metallb-system
create_namespace: false
wait: true
timeout: "30m"
# https://github.com/metallb/metallb/blob/main/charts/metallb/values.yaml
- name: Address pool for MetalLB

@@ -0,0 +1,22 @@
---
# https://kubernetes.io/docs/tasks/debug/debug-cluster/monitor-node-health/
- name: Helm add deliveryhero repo
delegate_to: "{{ kubectl_host }}"
run_once: true
kubernetes.core.helm_repository:
name: deliveryhero
repo_url: "https://charts.deliveryhero.io"
- name: Helm deploy node-problem-detector
delegate_to: "{{ kubectl_host }}"
kubernetes.core.helm:
kubeconfig: "{{ kubeconfig }}"
chart_ref: deliveryhero/node-problem-detector
release_name: node-problem-detector
release_namespace: kube-system
create_namespace: false
wait: true
set_values:
# https://github.com/deliveryhero/helm-charts/blob/master/stable/node-problem-detector/values.yaml
# https://github.com/deliveryhero/helm-charts/tree/master/stable/node-problem-detector#values
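The set_values list itself is cut off at the end of this hunk. For reference, kubernetes.core.helm expects it as a list of value / value_type entries; a generic illustration of the format (placeholder key, not the values used in this commit):

set_values:
  - value: "some.chart.key=some-value"   # placeholder; see the chart's values.yaml links above
    value_type: string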

@@ -152,5 +152,5 @@
url: "https://{{ rancher_hostname }}/v3/tokens?action=logout"
method: POST
headers:
Cookie: "R_SESS={{ _rancher_login.json.token }}"
Cookie: "R_SESS={{ _rancher_login.json.token | default(_rancher_pwchange_login.json.token) }}"
status_code: [200]
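With the default() fallback, logout works for both paths: the token registered by the normal login task is used when present, otherwise the one registered during the initial password-change login. The same pattern in isolation:

- name: Show which session token the logout call would use (illustration only)
  ansible.builtin.debug:
    msg: "{{ _rancher_login.json.token | default(_rancher_pwchange_login.json.token) }}"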

@@ -0,0 +1,85 @@
---
- name: Helm add Rancher Monitoring repo
delegate_to: "{{ kubectl_host }}"
run_once: true
kubernetes.core.helm_repository:
name: rancher-monitoring
repo_url: "https://raw.githubusercontent.com/rancher/charts/release-v2.8"
- name: Namespace
delegate_to: "{{ kubectl_host }}"
kubernetes.core.k8s:
kubeconfig: "{{ kubeconfig }}"
resource_definition:
apiVersion: v1
kind: Namespace
metadata:
name: cattle-monitoring-system
labels:
pod-security.kubernetes.io/audit: privileged
pod-security.kubernetes.io/enforce: privileged
pod-security.kubernetes.io/warn: privileged
- name: Helm deploy Rancher Monitoring
delegate_to: "{{ kubectl_host }}"
kubernetes.core.helm:
kubeconfig: "{{ kubeconfig }}"
chart_ref: rancher-monitoring/rancher-monitoring
release_name: rancher-monitoring
release_namespace: cattle-monitoring-system
create_namespace: false
wait: true
# https://github.com/rancher/charts/blob/release-v2.8/charts/rancher-monitoring/102.0.1%2Bup40.1.2/values.yaml
values:
global:
cattle:
url: "https://{{ rancher_hostname }}"
grafana:
# https://github.com/grafana/helm-charts/blob/main/charts/grafana/values.yaml
grafana.ini:
server:
domain: "{{ rancher_hostname }}"
root_url: "https://{{ rancher_hostname }}/api/v1/namespaces/cattle-monitoring-system/services/http:rancher-monitoring-grafana:80/proxy/"
persistence:
accessModes:
- ReadWriteOnce
enabled: true
size: 1Gi
storageClassName: mayastor-2replicas
type: pvc
sidecar:
dashboards:
enabled: true
searchNamespace: ALL
kube-state-metrics:
metricLabelsAllowlist:
- pods=[*]
- deployments=[app.kubernetes.io/name,app.kubernetes.io/component,app.kubernetes.io/instance]
prometheus:
serviceMonitorSelectorNilUsesHelmValues: false
podMonitorSelectorNilUsesHelmValues: false
prometheusSpec:
externalUrl: "https://{{ rancher_hostname }}/api/v1/namespaces/cattle-monitoring-system/services/http:rancher-monitoring-prometheus:9090/proxy"
retentionSize: 10GiB
scrapeInterval: 60s
resources:
limits:
memory: 6000Mi
cpu: 2000m
requests:
memory: 3000Mi
cpu: 1500m
storageSpec:
volumeClaimTemplate:
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 20Gi
storageClassName: mayastor-2replicas
alertmanager:
alertmanagerSpec:
externalUrl: "https://{{ rancher_hostname }}/api/v1/namespaces/cattle-monitoring-system/services/http:rancher-monitoring-alertmanager:9093/proxy"
logLevel: debug
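When rancher-monitoring is installed straight from the charts repo rather than through the Rancher UI, its companion CRD chart normally has to be installed first. That step is not visible in this diff; a hedged sketch of what it could look like:

# Hypothetical companion task; rancher-monitoring-crd is a separate chart in the same repo
- name: Helm deploy Rancher Monitoring CRDs
  delegate_to: "{{ kubectl_host }}"
  kubernetes.core.helm:
    kubeconfig: "{{ kubeconfig }}"
    chart_ref: rancher-monitoring/rancher-monitoring-crd
    release_name: rancher-monitoring-crd
    release_namespace: cattle-monitoring-system
    create_namespace: false
    wait: true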

@@ -0,0 +1,3 @@
---
dependencies:
- role: common

@@ -0,0 +1,16 @@
---
- name: Wait for APT Lock
ansible.builtin.include_tasks: shared/tasks/wait_apt.yml
- name: Uninstall chrony
ansible.builtin.apt:
name:
- chrony
state: absent
install_recommends: false
- name: Install systemd-timesyncd
ansible.builtin.apt:
name:
- systemd-timesyncd
install_recommends: false
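Installing the package does not by itself guarantee the service is active if chrony had stopped or masked it earlier; making the service state explicit would let the role converge from any starting point. A possible follow-up task (not part of this commit):

# Hypothetical follow-up, not shown in this diff
- name: Enable and start systemd-timesyncd
  ansible.builtin.systemd:
    name: systemd-timesyncd
    enabled: true
    state: started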

@@ -10,8 +10,27 @@
- name: Import talos_machine_status tasks
ansible.builtin.import_tasks: "{{ role_path }}/../../shared/tasks/talos_machine_status.yml"
# 'mode=try' does not work when the machine is in Maintenance mode.
- name: Dry run apply Talos node config
delegate_to: "{{ talosctl_host }}"
become: false
throttle: 1
ansible.builtin.command:
cmd: >-
talosctl apply-config
--dry-run
--file '{{ talos_node_config_file }}'
--nodes '{{ ansible_host }}'
--endpoints '{{ ansible_host }}'
{% if talos_machine_status.spec.stage == 'maintenance' %} --insecure{% endif %}
changed_when: false
environment:
TALOSCONFIG: "{{ talosconfig }}"
register: _talos_node_config_dry_run
# 'mode=try' does not work when the machine is in Maintenance mode.
- name: Apply Talos node config
when: '"No changes" not in _talos_node_config_dry_run.stderr'
delegate_to: "{{ talosctl_host }}"
become: false
throttle: 1
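The dry run registers its output and the real apply is skipped when talosctl reports "No changes" on stderr. The body of the apply task falls outside this hunk; presumably it runs the same command without --dry-run, roughly as follows (assumption, not shown in this commit):

# Continuation sketch (assumption; the remainder of the task is outside this hunk)
  ansible.builtin.command:
    cmd: >-
      talosctl apply-config
      --file '{{ talos_node_config_file }}'
      --nodes '{{ ansible_host }}'
      --endpoints '{{ ansible_host }}'
      {% if talos_machine_status.spec.stage == 'maintenance' %} --insecure{% endif %}
  changed_when: true
  environment:
    TALOSCONFIG: "{{ talosconfig }}"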

@@ -7,7 +7,8 @@
network:
hostname: "{{ inventory_hostname }}"
interfaces:
- interface: "{{ network_interface }}"
- deviceSelector:
busPath: "00:*"
addresses:
- "{{ ansible_host }}/{{ network_cidr_prefix }}"
routes:
@@ -15,23 +16,33 @@
gateway: "{{ (ansible_host ~ '/' ~ network_cidr_prefix) | ansible.utils.ipaddr('1') | ansible.utils.ipaddr('address') }}"
install:
disk: "{{ talos_disk }}"
image: "ghcr.io/siderolabs/installer:{{ talos_version }}"
kubelet:
extraArgs:
max-pods: "{% if 'talos_hardware_nodes' in group_names %}250{% else %}110{% endif %}"
- name: Create Talos interfaces bond append
when: "network_interface_bond is defined"
ansible.utils.update_fact:
updates:
- path: "_talos_override_config.machine.network.interfaces.0.bond"
value: "{{ network_interface_bond }}"
- path: "_talos_override_config.machine.network.interfaces.0.interface"
value: "bond0"
changed_when: false
register: _talos_override_update
- name: Apply Talos interfaces bond append
- name: Apply Talos interfaces bond append on override config
when: "network_interface_bond is defined"
ansible.builtin.set_fact:
_talos_override_config: "{{ _talos_override_update._talos_override_config }}"
- name: Remove deviceSelector
when: "network_interface_bond is defined"
ansible.builtin.set_fact:
_talos_override_config: "{{ _talos_override_config | ansible.utils.remove_keys(target=['deviceSelector']) }}"
- name: Create temp directory
delegate_to: "{{ talosctl_host }}"
ansible.builtin.file:
@@ -68,4 +79,4 @@
--force
--with-docs=false
--with-examples=false
changed_when: true
changed_when: false

@@ -0,0 +1,3 @@
---
dependencies:
- role: common

@@ -12,8 +12,9 @@
cmd: >-
talosctl reset
--graceful=false
--wait=true
--system-labels-to-wipe=STATE,EPHEMERAL
--reboot
--wait=true
--endpoints '{{ ansible_host }}'
--nodes '{{ ansible_host }}'
changed_when: true

@@ -6,3 +6,4 @@
ansible.builtin.wait_for:
host: "{{ ansible_host }}"
port: 50000
timeout: 600

@@ -1,4 +0,0 @@
---
# @TODO https://api2.hostbillapp.com/accounts/searchServiceByIP.html

@@ -1,4 +1,4 @@
# Playbook Talos Install
# Talos Install Playbook
---
- name: Cloudflare DNS
become: false
@@ -22,16 +22,6 @@
- talos_wait_port
- talos_config_apply
- name: Talos hardware nodes
become: false
gather_facts: false
hosts:
- talos_hardware_nodes
roles:
- role: dci_finish
tags:
- dci_finish
- name: Talos config
become: false
gather_facts: false
@@ -52,3 +42,13 @@
- talos_first_nodes
roles:
- role: talos_bootstrap
- name: Talos hardware nodes
become: false
gather_facts: false
hosts:
- talos_hardware_nodes
roles:
- role: dci_finish
tags:
- dci_finish

talos_machine_status.yml Normal file
@@ -0,0 +1,8 @@
---
- name: Role talos_machine_status
gather_facts: false
hosts:
- talos
roles:
- role: talos_machine_status