diff --git a/group_vars/all b/group_vars/all
index b2c3bf8..863d28b 100644
--- a/group_vars/all
+++ b/group_vars/all
@@ -1,22 +1,39 @@
 ---
 # Variables listed here are applicable to all host groups
+# Software versions
 docker_compose_version_to_install: 1.22.0
 docker_ce_version_to_install: 18.03.1
 nvm_version: v0.33.5
 node_version: 8.4.0
+
+# Storage settings
 registry_location: "registry.wimer.home"
 nfs_location: 10.0.0.150
 nfs_share: /volumeUSB1/usbshare
+
+# Machine configuration
 home_pub_key: https://raw.githubusercontent.com/jcwimer/ubuntu-template/master/post/id_rsa.pub
 standard_user: cody
 git_user: "Jacob Cody Wimer"
 git_email: "jacob.wimer@gmail.com"
+
+# Proxmox settings
 proxmox_user: "root@pam"
 proxmox_password: "{{ lookup('env', 'PROXMOX_PASSWORD') }}"
 ubuntu_template_vm_name: "ubuntu-server-1604"
+
+# DNS
+domain: wimer.home
 dns_server: "10.0.0.204"
 # haproxies
 vip_interface: ens18
 vip_address: 10.0.0.200
+
+# RKE
+rke_directory: /root/rke
+rke_version: 0.2.1
+rke_ssh_key_location: /root/id_home
+rke_nfs_path: "{{ nfs_share }}/raw-files/fileserver/shares/lab-data/kubernetes"
+alertmanager_email_password: "{{ lookup('env', 'GMAIL_SERVICE_PASSWORD') }}"
diff --git a/playbooks/site.yml b/playbooks/site.yml
index c33a76c..cba10ed 100644
--- a/playbooks/site.yml
+++ b/playbooks/site.yml
@@ -34,6 +34,12 @@
   tasks:
     - include: ../roles/developer-machine/tasks/main.yml
 
+- name: Set up Kubernetes
+  hosts: localhost
+  user: root
+  tasks:
+    - include: ../roles/kubernetes/tasks/main.yml
+
 - name: Initialize the swarm
   hosts: swarm-bootstrap
   user: root
diff --git a/roles/kubernetes/tasks/main.yml b/roles/kubernetes/tasks/main.yml
new file mode 100644
index 0000000..8e89889
--- /dev/null
+++ b/roles/kubernetes/tasks/main.yml
@@ -0,0 +1,69 @@
+---
+- name: Create RKE directory
+  file:
+    path: "{{ rke_directory }}"
+    state: directory
+  delegate_to: localhost
+  run_once: true
+
+- name: Create RKE configs directory
+  file:
+    path: "{{ rke_directory }}/configs"
+    state: directory
+  delegate_to: localhost
+  run_once: true
+
+- name: Install RKE
+  get_url:
+    dest: "{{ rke_directory }}/rke"
+    url: https://github.com/rancher/rke/releases/download/v{{ rke_version }}/rke_linux-amd64
+  delegate_to: localhost
+  run_once: true
+
+- name: Make RKE executable
+  file:
+    path: "{{ rke_directory }}/rke"
+    mode: "0755"
+  delegate_to: localhost
+  run_once: true
+
+- name: Put RKE cluster config in place
+  template:
+    src: ../templates/rke-cluster.yaml.j2
+    dest: "{{ rke_directory }}/rke-cluster.yaml"
+  delegate_to: localhost
+  run_once: true
+
+- name: Put RKE configs in place
+  template:
+    src: ../templates/rke-configs/{{ item }}.j2
+    dest: "{{ rke_directory }}/configs/{{ item }}"
+  with_items:
+    - nfs-client-deployment.yaml
+    - nfs-client-rbac.yaml
+    - nfs-client-storageclass.yaml
+    - alertmanager-pvc.yaml
+    - alertmanager-configmap.yaml
+    - alertmanager-deployment.yaml
+    - alertmanager-service.yaml
+    - kube-state-metrics-deployment.yaml
+    - kube-state-metrics-service.yaml
+    - kube-state-metrics-rbac.yaml
+    - node-exporter.yaml
+    - prometheus-configmap.yaml
+    - prometheus-rbac.yaml
+    - prometheus-statefulset.yaml
+    - prometheus-service.yaml
+    - monitoring-ingress.yaml
+  delegate_to: localhost
+  run_once: true
+
+- name: Run RKE
+  command: "{{ rke_directory }}/rke up --config {{ rke_directory }}/rke-cluster.yaml"
+  delegate_to: localhost
+  run_once: true
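+
+# After `rke up` completes, RKE drops a kubeconfig next to the cluster file,
+# named after it (kube_config_rke-cluster.yaml). A quick smoke test from the
+# control host, assuming kubectl is available there:
+#   kubectl --kubeconfig /root/rke/kube_config_rke-cluster.yaml get nodes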
diff --git a/roles/kubernetes/templates/rke-cluster.yaml.j2 b/roles/kubernetes/templates/rke-cluster.yaml.j2
new file mode 100644
index 0000000..c750977
--- /dev/null
+++ b/roles/kubernetes/templates/rke-cluster.yaml.j2
@@ -0,0 +1,54 @@
+---
+
+ssh_key_path: {{ rke_ssh_key_location }}
+
+cluster_name: rke-k8s
+ignore_docker_version: true
+kubernetes_version: v1.13.4-rancher1-2
+system_images:
+  kubernetes: rancher/hyperkube:v1.13.4-rancher1
+
+nodes:
+{% for node in groups['kube-masters'] %}
+  - address: {{ hostvars[node]['ansible_host'] }}
+    name: {{ node }}
+    user: {{ standard_user }}
+    role:
+      - controlplane
+      - etcd
+{% endfor %}
+{% for node in groups['kube-workers'] %}
+  - address: {{ hostvars[node]['ansible_host'] }}
+    name: {{ node }}
+    user: {{ standard_user }}
+    role:
+      - worker
+{% endfor %}
+
+authentication:
+  strategy: x509
+  sans:
+    - "{{ vip_address }}"
+    - "kube.{{ domain }}"
+
+addons_include:
+  - ./configs/nfs-client-deployment.yaml
+  - ./configs/nfs-client-rbac.yaml
+  - ./configs/nfs-client-storageclass.yaml
+  - ./configs/alertmanager-pvc.yaml
+  - ./configs/alertmanager-configmap.yaml
+  - ./configs/alertmanager-deployment.yaml
+  - ./configs/alertmanager-service.yaml
+  - ./configs/kube-state-metrics-deployment.yaml
+  - ./configs/kube-state-metrics-service.yaml
+  - ./configs/kube-state-metrics-rbac.yaml
+  - ./configs/node-exporter.yaml
+  - ./configs/prometheus-configmap.yaml
+  - ./configs/prometheus-rbac.yaml
+  - ./configs/prometheus-statefulset.yaml
+  - ./configs/prometheus-service.yaml
+  - ./configs/monitoring-ingress.yaml
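+
+# Note: addons are applied in list order, so the nfs-client manifests come
+# first; the alertmanager PVC and the prometheus volumeClaimTemplate both
+# request the "standard" StorageClass that the provisioner backs.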
diff --git a/roles/kubernetes/templates/rke-configs/alertmanager-configmap.yaml.j2 b/roles/kubernetes/templates/rke-configs/alertmanager-configmap.yaml.j2
new file mode 100644
index 0000000..471cb4a
--- /dev/null
+++ b/roles/kubernetes/templates/rke-configs/alertmanager-configmap.yaml.j2
@@ -0,0 +1,32 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: alertmanager-config
+  namespace: kube-system
+  labels:
+    kubernetes.io/cluster-service: "true"
+    addonmanager.kubernetes.io/mode: EnsureExists
+data:
+  alertmanager.yml: |
+    global: null
+    receivers:
+    - name: default-receiver
+    - name: email
+      email_configs:
+      - to: jacob.wimer@gmail.com
+        from: jacob.wimer@gmail.com
+        smarthost: smtp.gmail.com:587
+        auth_username: "jacob.wimer@gmail.com"
+        auth_identity: "jacob.wimer@gmail.com"
+        auth_password: "{{ alertmanager_email_password }}"
+        send_resolved: true
+
+    route:
+      group_interval: 5m
+      group_wait: 10s
+      receiver: email
+      repeat_interval: 3h
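+
+# The rendered alertmanager.yml can be sanity-checked before rollout with
+# amtool (ships with alertmanager), e.g.:
+#   amtool check-config alertmanager.yml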
diff --git a/roles/kubernetes/templates/rke-configs/alertmanager-deployment.yaml.j2 b/roles/kubernetes/templates/rke-configs/alertmanager-deployment.yaml.j2
new file mode 100644
index 0000000..744d9ad
--- /dev/null
+++ b/roles/kubernetes/templates/rke-configs/alertmanager-deployment.yaml.j2
@@ -0,0 +1,79 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: alertmanager
+  namespace: kube-system
+  labels:
+    k8s-app: alertmanager
+    kubernetes.io/cluster-service: "true"
+    addonmanager.kubernetes.io/mode: Reconcile
+    version: v0.14.0
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      k8s-app: alertmanager
+      version: v0.14.0
+  template:
+    metadata:
+      labels:
+        k8s-app: alertmanager
+        version: v0.14.0
+      annotations:
+        scheduler.alpha.kubernetes.io/critical-pod: ''
+    spec:
+      priorityClassName: system-cluster-critical
+      containers:
+        - name: prometheus-alertmanager
+          image: "prom/alertmanager:v0.14.0"
+          imagePullPolicy: "IfNotPresent"
+          args:
+            - --config.file=/etc/config/alertmanager.yml
+            - --storage.path=/data
+            - --web.external-url=/
+          ports:
+            - containerPort: 9093
+          readinessProbe:
+            httpGet:
+              path: /#/status
+              port: 9093
+            initialDelaySeconds: 30
+            timeoutSeconds: 30
+          volumeMounts:
+            - name: config-volume
+              mountPath: /etc/config
+            - name: storage-volume
+              mountPath: "/data"
+              subPath: ""
+          resources:
+            limits:
+              cpu: 10m
+              memory: 50Mi
+            requests:
+              cpu: 10m
+              memory: 50Mi
+        - name: prometheus-alertmanager-configmap-reload
+          image: "jimmidyson/configmap-reload:v0.1"
+          imagePullPolicy: "IfNotPresent"
+          args:
+            - --volume-dir=/etc/config
+            - --webhook-url=http://localhost:9093/-/reload
+          volumeMounts:
+            - name: config-volume
+              mountPath: /etc/config
+              readOnly: true
+          resources:
+            limits:
+              cpu: 10m
+              memory: 10Mi
+            requests:
+              cpu: 10m
+              memory: 10Mi
+      volumes:
+        - name: config-volume
+          configMap:
+            name: alertmanager-config
+        - name: storage-volume
+          persistentVolumeClaim:
+            claimName: alertmanager
+
diff --git a/roles/kubernetes/templates/rke-configs/alertmanager-pvc.yaml.j2 b/roles/kubernetes/templates/rke-configs/alertmanager-pvc.yaml.j2
new file mode 100644
index 0000000..b82b12e
--- /dev/null
+++ b/roles/kubernetes/templates/rke-configs/alertmanager-pvc.yaml.j2
@@ -0,0 +1,14 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: alertmanager
+  namespace: kube-system
+  labels:
+    app: alertmanager
+spec:
+  storageClassName: standard
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 2Gi
diff --git a/roles/kubernetes/templates/rke-configs/alertmanager-service.yaml.j2 b/roles/kubernetes/templates/rke-configs/alertmanager-service.yaml.j2
new file mode 100644
index 0000000..62c7b59
--- /dev/null
+++ b/roles/kubernetes/templates/rke-configs/alertmanager-service.yaml.j2
@@ -0,0 +1,18 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: alertmanager
+  namespace: kube-system
+  labels:
+    kubernetes.io/cluster-service: "true"
+    addonmanager.kubernetes.io/mode: Reconcile
+    kubernetes.io/name: "Alertmanager"
+spec:
+  ports:
+    - name: http
+      port: 80
+      protocol: TCP
+      targetPort: 9093
+  selector:
+    k8s-app: alertmanager
+  type: "ClusterIP"
diff --git a/roles/kubernetes/templates/rke-configs/kube-state-metrics-deployment.yaml.j2 b/roles/kubernetes/templates/rke-configs/kube-state-metrics-deployment.yaml.j2
new file mode 100644
index 0000000..823696b
--- /dev/null
+++ b/roles/kubernetes/templates/rke-configs/kube-state-metrics-deployment.yaml.j2
@@ -0,0 +1,95 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: kube-state-metrics
+  namespace: kube-system
+  labels:
+    k8s-app: kube-state-metrics
+    kubernetes.io/cluster-service: "true"
+    addonmanager.kubernetes.io/mode: Reconcile
+    version: v1.3.0
+spec:
+  selector:
+    matchLabels:
+      k8s-app: kube-state-metrics
+      version: v1.3.0
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        k8s-app: kube-state-metrics
+        version: v1.3.0
+      annotations:
+        scheduler.alpha.kubernetes.io/critical-pod: ''
+    spec:
+      priorityClassName: system-cluster-critical
+      serviceAccountName: kube-state-metrics
+      containers:
+        - name: kube-state-metrics
+          image: quay.io/coreos/kube-state-metrics:v1.3.0
+          ports:
+            - name: http-metrics
+              containerPort: 8080
+            - name: telemetry
+              containerPort: 8081
+          readinessProbe:
+            httpGet:
+              path: /healthz
+              port: 8080
+            initialDelaySeconds: 5
+            timeoutSeconds: 5
+        - name: addon-resizer
+          image: k8s.gcr.io/addon-resizer:1.8.4
+          resources:
+            limits:
+              cpu: 100m
+              memory: 30Mi
+            requests:
+              cpu: 100m
+              memory: 30Mi
+          env:
+            - name: MY_POD_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.name
+            - name: MY_POD_NAMESPACE
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.namespace
+          volumeMounts:
+            - name: config-volume
+              mountPath: /etc/config
+          command:
+            - /pod_nanny
+            - --config-dir=/etc/config
+            - --container=kube-state-metrics
+            - --cpu=100m
+            - --extra-cpu=1m
+            - --memory=100Mi
+            - --extra-memory=2Mi
+            - --threshold=5
+            - --deployment=kube-state-metrics
+      volumes:
+        - name: config-volume
+          configMap:
+            name: kube-state-metrics-config
+---
+# Config map for resource configuration.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: kube-state-metrics-config
+  namespace: kube-system
+  labels:
+    k8s-app: kube-state-metrics
+    kubernetes.io/cluster-service: "true"
+    addonmanager.kubernetes.io/mode: Reconcile
+data:
+  NannyConfiguration: |-
+    apiVersion: nannyconfig/v1alpha1
+    kind: NannyConfiguration
+
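+
+# addon-resizer (the /pod_nanny command above) scales kube-state-metrics'
+# requests/limits with cluster size: a base of 100m/100Mi plus roughly
+# 1m CPU and 2Mi memory per node.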
diff --git a/roles/kubernetes/templates/rke-configs/kube-state-metrics-rbac.yaml.j2 b/roles/kubernetes/templates/rke-configs/kube-state-metrics-rbac.yaml.j2
new file mode 100644
index 0000000..6eb2981
--- /dev/null
+++ b/roles/kubernetes/templates/rke-configs/kube-state-metrics-rbac.yaml.j2
@@ -0,0 +1,104 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: kube-state-metrics
+  namespace: kube-system
+  labels:
+    kubernetes.io/cluster-service: "true"
+    addonmanager.kubernetes.io/mode: Reconcile
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: kube-state-metrics
+  labels:
+    kubernetes.io/cluster-service: "true"
+    addonmanager.kubernetes.io/mode: Reconcile
+rules:
+- apiGroups: [""]
+  resources:
+  - configmaps
+  - secrets
+  - nodes
+  - pods
+  - services
+  - resourcequotas
+  - replicationcontrollers
+  - limitranges
+  - persistentvolumeclaims
+  - persistentvolumes
+  - namespaces
+  - endpoints
+  verbs: ["list", "watch"]
+- apiGroups: ["extensions"]
+  resources:
+  - daemonsets
+  - deployments
+  - replicasets
+  verbs: ["list", "watch"]
+- apiGroups: ["apps"]
+  resources:
+  - statefulsets
+  verbs: ["list", "watch"]
+- apiGroups: ["batch"]
+  resources:
+  - cronjobs
+  - jobs
+  verbs: ["list", "watch"]
+- apiGroups: ["autoscaling"]
+  resources:
+  - horizontalpodautoscalers
+  verbs: ["list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: kube-state-metrics-resizer
+  namespace: kube-system
+  labels:
+    kubernetes.io/cluster-service: "true"
+    addonmanager.kubernetes.io/mode: Reconcile
+rules:
+- apiGroups: [""]
+  resources:
+  - pods
+  verbs: ["get"]
+- apiGroups: ["extensions"]
+  resources:
+  - deployments
+  resourceNames: ["kube-state-metrics"]
+  verbs: ["get", "update"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: kube-state-metrics
+  labels:
+    kubernetes.io/cluster-service: "true"
+    addonmanager.kubernetes.io/mode: Reconcile
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: kube-state-metrics
+subjects:
+- kind: ServiceAccount
+  name: kube-state-metrics
+  namespace: kube-system
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: kube-state-metrics
+  namespace: kube-system
+  labels:
+    kubernetes.io/cluster-service: "true"
+    addonmanager.kubernetes.io/mode: Reconcile
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: kube-state-metrics-resizer
+subjects:
+- kind: ServiceAccount
+  name: kube-state-metrics
+  namespace: kube-system
+
diff --git a/roles/kubernetes/templates/rke-configs/kube-state-metrics-service.yaml.j2 b/roles/kubernetes/templates/rke-configs/kube-state-metrics-service.yaml.j2
new file mode 100644
index 0000000..bad3ffd
--- /dev/null
+++ b/roles/kubernetes/templates/rke-configs/kube-state-metrics-service.yaml.j2
@@ -0,0 +1,23 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: kube-state-metrics
+  namespace: kube-system
+  labels:
+    kubernetes.io/cluster-service: "true"
+    addonmanager.kubernetes.io/mode: Reconcile
+    kubernetes.io/name: "kube-state-metrics"
+  annotations:
+    prometheus.io/scrape: 'true'
+spec:
+  ports:
+    - name: http-metrics
+      port: 8080
+      targetPort: http-metrics
+      protocol: TCP
+    - name: telemetry
+      port: 8081
+      targetPort: telemetry
+      protocol: TCP
+  selector:
+    k8s-app: kube-state-metrics
diff --git a/roles/kubernetes/templates/rke-configs/monitoring-ingress.yaml.j2 b/roles/kubernetes/templates/rke-configs/monitoring-ingress.yaml.j2
new file mode 100644
index 0000000..feb13f3
--- /dev/null
+++ b/roles/kubernetes/templates/rke-configs/monitoring-ingress.yaml.j2
@@ -0,0 +1,32 @@
+apiVersion: extensions/v1beta1
+kind: Ingress
+metadata:
+  namespace: kube-system
+  name: alertmanager
+  #annotations:
+  #  kubernetes.io/ingress.class: traefik
+spec:
+  rules:
+    - host: alertmanager.{{ domain }}
+      http:
+        paths:
+          - backend:
+              serviceName: alertmanager
+              servicePort: 80
+---
+apiVersion: extensions/v1beta1
+kind: Ingress
+metadata:
+  namespace: kube-system
+  name: prometheus
+  #annotations:
+  #  kubernetes.io/ingress.class: traefik
+spec:
+  rules:
+    - host: prometheus.{{ domain }}
+      http:
+        paths:
+          - backend:
+              serviceName: prometheus
+              servicePort: 9090
+
diff --git a/roles/kubernetes/templates/rke-configs/nfs-client-deployment.yaml.j2 b/roles/kubernetes/templates/rke-configs/nfs-client-deployment.yaml.j2
new file mode 100644
index 0000000..3557fe2
--- /dev/null
+++ b/roles/kubernetes/templates/rke-configs/nfs-client-deployment.yaml.j2
@@ -0,0 +1,43 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: nfs-client-provisioner
+  namespace: kube-system
+---
+kind: Deployment
+apiVersion: extensions/v1beta1
+metadata:
+  name: nfs-client-provisioner
+  namespace: kube-system
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  template:
+    metadata:
+      labels:
+        app: nfs-client-provisioner
+    spec:
+      serviceAccountName: nfs-client-provisioner
+      containers:
+        - name: nfs-client-provisioner
+          image: quay.io/external_storage/nfs-client-provisioner:latest
+          volumeMounts:
+            - name: nfs-client-root
+              mountPath: /persistentvolumes
+          env:
+            - name: PROVISIONER_NAME
+              value: {{ domain }}/nfs
+            - name: NFS_SERVER
+              value: {{ nfs_location }}
+            - name: NFS_PATH
+              value: {{ rke_nfs_path }}
+      volumes:
+        - name: nfs-client-root
+          nfs:
+            server: {{ nfs_location }}
+            path: {{ rke_nfs_path }}
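+
+# PROVISIONER_NAME must match the `provisioner` field of the StorageClass in
+# nfs-client-storageclass.yaml.j2; both render from the same variable as
+# "{{ domain }}/nfs", so the two templates stay in sync.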
diff --git a/roles/kubernetes/templates/rke-configs/nfs-client-rbac.yaml.j2 b/roles/kubernetes/templates/rke-configs/nfs-client-rbac.yaml.j2
new file mode 100644
index 0000000..8c8c640
--- /dev/null
+++ b/roles/kubernetes/templates/rke-configs/nfs-client-rbac.yaml.j2
@@ -0,0 +1,60 @@
+kind: ServiceAccount
+apiVersion: v1
+metadata:
+  name: nfs-client-provisioner
+  namespace: kube-system
+---
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: nfs-client-provisioner-runner
+rules:
+  - apiGroups: [""]
+    resources: ["persistentvolumes"]
+    verbs: ["get", "list", "watch", "create", "delete"]
+  - apiGroups: [""]
+    resources: ["persistentvolumeclaims"]
+    verbs: ["get", "list", "watch", "update"]
+  - apiGroups: ["storage.k8s.io"]
+    resources: ["storageclasses"]
+    verbs: ["get", "list", "watch"]
+  - apiGroups: [""]
+    resources: ["events"]
+    verbs: ["create", "update", "patch"]
+---
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: run-nfs-client-provisioner
+subjects:
+  - kind: ServiceAccount
+    name: nfs-client-provisioner
+    namespace: kube-system
+roleRef:
+  kind: ClusterRole
+  name: nfs-client-provisioner-runner
+  apiGroup: rbac.authorization.k8s.io
+---
+kind: Role
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: leader-locking-nfs-client-provisioner
+  namespace: kube-system
+rules:
+  - apiGroups: [""]
+    resources: ["endpoints"]
+    verbs: ["get", "list", "watch", "create", "update", "patch"]
+---
+kind: RoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: leader-locking-nfs-client-provisioner
+  namespace: kube-system
+subjects:
+  - kind: ServiceAccount
+    name: nfs-client-provisioner
+    namespace: kube-system
+roleRef:
+  kind: Role
+  name: leader-locking-nfs-client-provisioner
+  apiGroup: rbac.authorization.k8s.io
diff --git a/roles/kubernetes/templates/rke-configs/nfs-client-storageclass.yaml.j2 b/roles/kubernetes/templates/rke-configs/nfs-client-storageclass.yaml.j2
new file mode 100644
index 0000000..c81654d
--- /dev/null
+++ b/roles/kubernetes/templates/rke-configs/nfs-client-storageclass.yaml.j2
@@ -0,0 +1,7 @@
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: standard
+provisioner: {{ domain }}/nfs  # must match the deployment's PROVISIONER_NAME env var
+parameters:
+  archiveOnDelete: "false"
diff --git a/roles/kubernetes/templates/rke-configs/node-exporter.yaml.j2 b/roles/kubernetes/templates/rke-configs/node-exporter.yaml.j2
new file mode 100644
index 0000000..3420285
--- /dev/null
+++ b/roles/kubernetes/templates/rke-configs/node-exporter.yaml.j2
@@ -0,0 +1,45 @@
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: kube-system
+  annotations:
+    prometheus.io/scrape: 'true'
+  labels:
+    app: node-exporter
+    name: node-exporter
+  name: node-exporter
+spec:
+  clusterIP: None
+  ports:
+    - name: scrape
+      port: 9100
+      protocol: TCP
+  selector:
+    app: node-exporter
+  type: ClusterIP
+---
+apiVersion: extensions/v1beta1
+kind: DaemonSet
+metadata:
+  namespace: kube-system
+  name: node-exporter
+spec:
+  template:
+    metadata:
+      labels:
+        app: node-exporter
+        name: node-exporter
+    spec:
+      containers:
+        - image: prom/node-exporter
+          name: node-exporter
+          ports:
+            - containerPort: 9100
+              hostPort: 9100
+              name: scrape
+      hostNetwork: true
+      hostPID: true
+
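+# hostNetwork/hostPID plus the headless Service (clusterIP: None) above let
+# prometheus discover every node's exporter as DNS A records on port 9100
+# (see the dns_sd_configs job in prometheus-configmap.yaml.j2).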
diff --git a/roles/kubernetes/templates/rke-configs/prometheus-configmap.yaml.j2 b/roles/kubernetes/templates/rke-configs/prometheus-configmap.yaml.j2
new file mode 100644
index 0000000..1f7da47
--- /dev/null
+++ b/roles/kubernetes/templates/rke-configs/prometheus-configmap.yaml.j2
@@ -0,0 +1,241 @@
+# Prometheus configuration format https://prometheus.io/docs/prometheus/latest/configuration/configuration/
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: prometheus-config
+  namespace: kube-system
+  labels:
+    kubernetes.io/cluster-service: "true"
+    addonmanager.kubernetes.io/mode: EnsureExists
+data:
+  rules.yml: |
+    # The raw/endraw block keeps Jinja from failing on the double curly braces in prometheus' own template syntax.
+    # {% raw %}
+    groups:
+    # node-exporter
+    - name: alert.rules_nodes
+      rules:
+      - alert: high_memory_usage_on_node
+        expr: ((node_memory_MemTotal - node_memory_MemAvailable) / node_memory_MemTotal)
+          * 100 > 80
+        for: 5m
+        annotations:
+          description: '{{ $labels.host }} is using a LOT of MEMORY. MEMORY usage is over
+            {{ humanize $value}}%.'
+          summary: HIGH MEMORY USAGE WARNING TASK ON '{{ $labels.host }}'
+      - alert: high_la_usage_on_node
+        expr: node_load5 > 7
+        for: 5m
+        annotations:
+          description: '{{ $labels.host }} has a high load average. Load Average 5m is
+            {{ humanize $value}}.'
+          summary: HIGH LOAD AVERAGE WARNING ON '{{ $labels.host }}'
+      - alert: node_running_out_of_disk_space
+        expr: (node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})
+          * 100 / node_filesystem_size{mountpoint="/"} > 80
+        for: 5m
+        annotations:
+          description: More than 80% of disk used. Disk usage {{ humanize $value }}%.
+          summary: 'LOW DISK SPACE WARNING: NODE ''{{ $labels.host }}'''
+      - alert: monitoring_service_down
+        expr: up == 0
+        for: 90s
+        annotations:
+          description: "The monitoring service '{{ $labels.job }}' is down."
+          summary: "MONITORING SERVICE DOWN WARNING: NODE '{{ $labels.host }}'"
+    # {% endraw %}
+
+    # ceph
+    - name: alert.rules_ceph
+      rules:
+      - alert: ceph_health_warning
+        expr: ceph_health_status == 1
+        for: 5m
+        annotations:
+          description: CEPH CLUSTER HEALTH WARNING
+          summary: CEPH CLUSTER HEALTH WARNING
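+    # Rule files can be checked offline with promtool, e.g.
+    #   promtool check rules rules.yml
+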
+  prometheus.yml: |
+    alerting:
+      alertmanagers:
+      - static_configs:
+        - targets: ["alertmanager"]
+    rule_files:
+    - "rules.yml"
+    scrape_configs:
+    #- job_name: 'ceph'
+    #  static_configs:
+    #    - targets:
+    #      - 'rook-ceph-mgr-external:9283'
+    - job_name: 'kubernetes-node-exporter'
+      dns_sd_configs:
+      - names:
+        - 'node-exporter'
+        type: 'A'
+        port: 9100
+
+    - job_name: prometheus
+      static_configs:
+      - targets:
+        - localhost:9090
+
+    - job_name: kubernetes-apiservers
+      kubernetes_sd_configs:
+      - role: endpoints
+      relabel_configs:
+      - action: keep
+        regex: default;kubernetes;https
+        source_labels:
+        - __meta_kubernetes_namespace
+        - __meta_kubernetes_service_name
+        - __meta_kubernetes_endpoint_port_name
+      scheme: https
+      tls_config:
+        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+        insecure_skip_verify: true
+      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+    - job_name: kubernetes-nodes-kubelet
+      kubernetes_sd_configs:
+      - role: node
+      relabel_configs:
+      - action: labelmap
+        regex: __meta_kubernetes_node_label_(.+)
+      scheme: https
+      tls_config:
+        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+        insecure_skip_verify: true
+      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+    - job_name: kubernetes-nodes-cadvisor
+      kubernetes_sd_configs:
+      - role: node
+      relabel_configs:
+      - action: labelmap
+        regex: __meta_kubernetes_node_label_(.+)
+      - target_label: __metrics_path__
+        replacement: /metrics/cadvisor
+      scheme: https
+      tls_config:
+        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+        insecure_skip_verify: true
+      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+    - job_name: kubernetes-service-endpoints
+      kubernetes_sd_configs:
+      - role: endpoints
+      relabel_configs:
+      - action: keep
+        regex: true
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_scrape
+      - action: replace
+        regex: (https?)
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_scheme
+        target_label: __scheme__
+      - action: replace
+        regex: (.+)
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_path
+        target_label: __metrics_path__
+      - action: replace
+        regex: ([^:]+)(?::\d+)?;(\d+)
+        replacement: $1:$2
+        source_labels:
+        - __address__
+        - __meta_kubernetes_service_annotation_prometheus_io_port
+        target_label: __address__
+      - action: labelmap
+        regex: __meta_kubernetes_service_label_(.+)
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_namespace
+        target_label: kubernetes_namespace
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_service_name
+        target_label: kubernetes_name
+
+    - job_name: kubernetes-services
+      kubernetes_sd_configs:
+      - role: service
+      metrics_path: /probe
+      params:
+        module:
+        - http_2xx
+      relabel_configs:
+      - action: keep
+        regex: true
+        source_labels:
+        - __meta_kubernetes_service_annotation_prometheus_io_probe
+      - source_labels:
+        - __address__
+        target_label: __param_target
+      - replacement: blackbox
+        target_label: __address__
+      - source_labels:
+        - __param_target
+        target_label: instance
+      - action: labelmap
+        regex: __meta_kubernetes_service_label_(.+)
+      - source_labels:
+        - __meta_kubernetes_namespace
+        target_label: kubernetes_namespace
+      - source_labels:
+        - __meta_kubernetes_service_name
+        target_label: kubernetes_name
+
+    - job_name: kubernetes-pods
+      kubernetes_sd_configs:
+      - role: pod
+      relabel_configs:
+      - action: keep
+        regex: true
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_scrape
+      - action: replace
+        regex: (.+)
+        source_labels:
+        - __meta_kubernetes_pod_annotation_prometheus_io_path
+        target_label: __metrics_path__
+      - action: replace
+        regex: ([^:]+)(?::\d+)?;(\d+)
+        replacement: $1:$2
+        source_labels:
+        - __address__
+        - __meta_kubernetes_pod_annotation_prometheus_io_port
+        target_label: __address__
+      - action: labelmap
+        regex: __meta_kubernetes_pod_label_(.+)
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_namespace
+        target_label: kubernetes_namespace
+      - action: replace
+        source_labels:
+        - __meta_kubernetes_pod_name
+        target_label: kubernetes_pod_name
+    #alerting:
+    #  alertmanagers:
+    #  - kubernetes_sd_configs:
+    #      - role: pod
+    #    tls_config:
+    #      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+    #    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+    #    relabel_configs:
+    #    - source_labels: [__meta_kubernetes_namespace]
+    #      regex: kube-system
+    #      action: keep
+    #    - source_labels: [__meta_kubernetes_pod_label_k8s_app]
+    #      regex: alertmanager
+    #      action: keep
+    #    - source_labels: [__meta_kubernetes_pod_container_port_number]
+    #      regex:
+    #      action: drop
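+
+# Config changes do not need a pod restart: the configmap-reload sidecar in
+# the prometheus statefulset watches /etc/config and POSTs the /-/reload hook.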
diff --git a/roles/kubernetes/templates/rke-configs/prometheus-rbac.yaml.j2 b/roles/kubernetes/templates/rke-configs/prometheus-rbac.yaml.j2
new file mode 100644
index 0000000..1961730
--- /dev/null
+++ b/roles/kubernetes/templates/rke-configs/prometheus-rbac.yaml.j2
@@ -0,0 +1,56 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: prometheus
+  namespace: kube-system
+  labels:
+    kubernetes.io/cluster-service: "true"
+    addonmanager.kubernetes.io/mode: Reconcile
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRole
+metadata:
+  name: prometheus
+  labels:
+    kubernetes.io/cluster-service: "true"
+    addonmanager.kubernetes.io/mode: Reconcile
+rules:
+  - apiGroups:
+      - ""
+    resources:
+      - nodes
+      - nodes/metrics
+      - services
+      - endpoints
+      - pods
+    verbs:
+      - get
+      - list
+      - watch
+  - apiGroups:
+      - ""
+    resources:
+      - configmaps
+    verbs:
+      - get
+  - nonResourceURLs:
+      - "/metrics"
+    verbs:
+      - get
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRoleBinding
+metadata:
+  name: prometheus
+  labels:
+    kubernetes.io/cluster-service: "true"
+    addonmanager.kubernetes.io/mode: Reconcile
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: prometheus
+subjects:
+- kind: ServiceAccount
+  name: prometheus
+  namespace: kube-system
+
diff --git a/roles/kubernetes/templates/rke-configs/prometheus-service.yaml.j2 b/roles/kubernetes/templates/rke-configs/prometheus-service.yaml.j2
new file mode 100644
index 0000000..928bb4a
--- /dev/null
+++ b/roles/kubernetes/templates/rke-configs/prometheus-service.yaml.j2
@@ -0,0 +1,21 @@
+kind: Service
+apiVersion: v1
+metadata:
+  name: prometheus
+  namespace: kube-system
+  labels:
+    kubernetes.io/name: "Prometheus"
+    kubernetes.io/cluster-service: "true"
+    addonmanager.kubernetes.io/mode: Reconcile
+spec:
+  ports:
+    - name: http
+      port: 9090
+      protocol: TCP
+      targetPort: 9090
+  selector:
+    k8s-app: prometheus
+
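+# The monitoring ingress (monitoring-ingress.yaml.j2) fronts this service at
+# prometheus.{{ domain }}; its backend servicePort 9090 matches the port
+# exposed here.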
diff --git a/roles/kubernetes/templates/rke-configs/prometheus-statefulset.yaml.j2 b/roles/kubernetes/templates/rke-configs/prometheus-statefulset.yaml.j2
new file mode 100644
index 0000000..437b5bd
--- /dev/null
+++ b/roles/kubernetes/templates/rke-configs/prometheus-statefulset.yaml.j2
@@ -0,0 +1,110 @@
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: prometheus
+  namespace: kube-system
+  labels:
+    k8s-app: prometheus
+    kubernetes.io/cluster-service: "true"
+    addonmanager.kubernetes.io/mode: Reconcile
+    version: v2.2.1
+spec:
+  serviceName: "prometheus"
+  replicas: 1
+  podManagementPolicy: "Parallel"
+  updateStrategy:
+    type: "RollingUpdate"
+  selector:
+    matchLabels:
+      k8s-app: prometheus
+  template:
+    metadata:
+      labels:
+        k8s-app: prometheus
+      annotations:
+        scheduler.alpha.kubernetes.io/critical-pod: ''
+    spec:
+      priorityClassName: system-cluster-critical
+      serviceAccountName: prometheus
+      initContainers:
+        - name: "init-chown-data"
+          image: "busybox:latest"
+          imagePullPolicy: "IfNotPresent"
+          command: ["chown", "-R", "65534:65534", "/data"]
+          volumeMounts:
+            - name: prometheus-data
+              mountPath: /data
+              subPath: ""
+      containers:
+        - name: prometheus-server-configmap-reload
+          image: "jimmidyson/configmap-reload:v0.1"
+          imagePullPolicy: "IfNotPresent"
+          args:
+            - --volume-dir=/etc/config
+            - --webhook-url=http://localhost:9090/-/reload
+          volumeMounts:
+            - name: config-volume
+              mountPath: /etc/config
+              readOnly: true
+          resources:
+            limits:
+              cpu: 10m
+              memory: 10Mi
+            requests:
+              cpu: 10m
+              memory: 10Mi
+
+        - name: prometheus-server
+          image: "prom/prometheus:v2.2.1"
+          imagePullPolicy: "IfNotPresent"
+          args:
+            - --config.file=/etc/config/prometheus.yml
+            - --storage.tsdb.path=/data
+            - --web.console.libraries=/etc/prometheus/console_libraries
+            - --web.console.templates=/etc/prometheus/consoles
+            - --web.enable-lifecycle
+          ports:
+            - containerPort: 9090
+          readinessProbe:
+            httpGet:
+              path: /-/ready
+              port: 9090
+            initialDelaySeconds: 30
+            timeoutSeconds: 30
+          livenessProbe:
+            httpGet:
+              path: /-/healthy
+              port: 9090
+            initialDelaySeconds: 30
+            timeoutSeconds: 30
+          # based on 10 running nodes with 30 pods each
+          resources:
+            limits:
+              cpu: 200m
+              memory: 1000Mi
+            requests:
+              cpu: 200m
+              memory: 1000Mi
+
+          volumeMounts:
+            - name: config-volume
+              mountPath: /etc/config
+            - name: prometheus-data
+              mountPath: /data
+              subPath: ""
+      terminationGracePeriodSeconds: 300
+      volumes:
+        - name: config-volume
+          configMap:
+            name: prometheus-config
+  volumeClaimTemplates:
+    - metadata:
+        name: prometheus-data
+      spec:
+        storageClassName: standard
+        accessModes:
+          - ReadWriteOnce
+        resources:
+          requests:
+            storage: "16Gi"
diff --git a/supporting-scripts/site.sh b/supporting-scripts/site.sh
index c7506d9..f3176fd 100755
--- a/supporting-scripts/site.sh
+++ b/supporting-scripts/site.sh
@@ -1,6 +1,46 @@
 #!/bin/bash
 #keep adding dirname's to go up more directories.
 project_dir="$(dirname $( dirname $(readlink -f ${BASH_SOURCE[0]})))"
-#ANSIBLE_HOST_KEY_CHECKING=False ansible-playbook -i ${project_dir}/hosts ${project_dir}/playbooks/lxc-test.yml
-ANSIBLE_HOST_KEY_CHECKING=False ansible-playbook -i ${project_dir}/hosts ${project_dir}/playbooks/kvm.yml
-ANSIBLE_HOST_KEY_CHECKING=False ansible-playbook -i ${project_dir}/hosts ${project_dir}/playbooks/site.yml
+
+# Colors for error output; fall back to empty strings on dumb terminals.
+red="$(tput setaf 1 2>/dev/null || true)"
+reset="$(tput sgr0 2>/dev/null || true)"
+
+declare -ar REQUIRED_ENVIRONMENT_VARIABLES=(
+  "PROXMOX_PASSWORD"
+  "GMAIL_SERVICE_PASSWORD"
+)
+
+main() {
+  check-env
+  run-ansible
+}
+
+check-env() {
+  local -a undefined_variables=()
+
+  for var in "${REQUIRED_ENVIRONMENT_VARIABLES[@]}"; do
+    if [[ ! -v ${var} ]]; then
+      undefined_variables+=("${var}")
+    fi
+  done
+
+  if [[ "${#undefined_variables[@]}" -gt 0 ]]; then
+    echo "${red}ERROR: The following environment variables must be defined:"
+    printf '  %s\n' "${undefined_variables[@]}"
+    echo "${reset}"
+    exit 1
+  fi
+}
+
+run-ansible() {
+  #ANSIBLE_HOST_KEY_CHECKING=False ansible-playbook -i ${project_dir}/hosts ${project_dir}/playbooks/lxc-test.yml
+  ANSIBLE_HOST_KEY_CHECKING=False ansible-playbook -i ${project_dir}/hosts ${project_dir}/playbooks/kvm.yml
+  ANSIBLE_HOST_KEY_CHECKING=False ansible-playbook -i ${project_dir}/hosts ${project_dir}/playbooks/site.yml
+}
+
+[[ $0 == "${BASH_SOURCE[0]}" ]] && main "$@"
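+
+# Typical invocation, assuming both secrets are exported in the calling shell:
+#   export PROXMOX_PASSWORD=... GMAIL_SERVICE_PASSWORD=...
+#   ./supporting-scripts/site.sh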