Thank you for reading this post, don't forget to subscribe!
Когда вы используете Karpenter и жёстко не фиксируете AMI ваших instance'ов, то при их обновлении со стороны AWS происходит drift, и Karpenter обновляет все ноды. Чтобы отслеживать это, создавать алерт, а также Jira task, был сделан следующий модуль: он мониторит CloudWatch и с помощью Lambda отправляет оповещения.
А данные в CloudWatch будем отправлять, запуская Fluent Bit только на тех нодах, где работает Karpenter.
/TEST/infra/terraform-module/cloudwatch-karpenter-log/main.tf
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
terraform {
  required_version = ">= 0.13"

  # Bug fix: this block was commented out, but the module uses
  # kubectl_manifest resources, so the gavinbunney/kubectl provider
  # must be declared or `terraform init` resolves the wrong source.
  required_providers {
    kubectl = {
      source  = "gavinbunney/kubectl"
      version = ">= 1.14.0"
    }
  }
}

# Namespace hosting the Fluent Bit DaemonSet and its ConfigMaps.
# Consistency fix: use var.namespace instead of the hard-coded
# "cloudwatch-logs" so it always matches the namespaced manifests below
# (callers currently pass namespace = "cloudwatch-logs", so no change in practice).
resource "kubernetes_namespace" "fluent_bit" {
  metadata {
    name = var.namespace
  }
}

# NOTE(review): this random_id is not referenced by any other resource in
# this module — it only re-renders gen-configmap.yaml.tftpl as a keeper.
# Presumably it was meant to force a rollout when the config changes;
# confirm intent before removing.
resource "random_id" "configmap" {
  byte_length = 8
  keepers = {
    data = templatefile("${path.module}/templates/gen-configmap.yaml.tftpl", {
      cluster_name = var.cluster_name
      http_port    = var.http_port
      http_server  = var.http_server
      logs_region  = var.logs_region
      read_head    = var.read_head
      read_tail    = var.read_tail
    })
  }
}

# Cluster info consumed by the DaemonSet as container env vars
# (see daemonset.yaml.tftpl configMapKeyRef entries).
resource "kubernetes_config_map" "fluent-bit_config_map" {
  metadata {
    name      = "fluent-bit-cluster-info"
    namespace = kubernetes_namespace.fluent_bit.metadata.0.name
  }

  # Idiom: plain references instead of redundant "${var.x}" interpolations.
  data = {
    "cluster.name" = var.cluster_name
    "http.port"    = var.http_port
    "http.server"  = var.http_server
    "logs.region"  = var.logs_region
    "read.head"    = var.read_head
    "read.tail"    = var.read_tail
  }
}

# RBAC: read access to pod metadata for the Fluent Bit kubernetes filter.
resource "kubectl_manifest" "clusterrole" {
  yaml_body = file("${path.module}/templates/clusterrole.yaml")
}

resource "kubectl_manifest" "clusterrolebinding" {
  yaml_body = templatefile("${path.module}/templates/clusterrolebinding.yaml.tftpl", {
    namespace = var.namespace
  })
}

# ServiceAccount annotated with the IRSA role that allows writing to CloudWatch.
resource "kubectl_manifest" "serviceaccount" {
  yaml_body = templatefile("${path.module}/templates/serviceaccount.yaml.tftpl", {
    role_arn  = var.role_arn
    namespace = var.namespace
  })
}

resource "kubectl_manifest" "configmap" {
  yaml_body = templatefile("${path.module}/templates/configmap.yaml.tftpl", {
    namespace             = var.namespace
    cloudwatch_group_name = var.cloudwatch_group_name
  })
}

# The DaemonSet ships logs into the log group, so create the group first.
resource "kubectl_manifest" "daemonset" {
  yaml_body = templatefile("${path.module}/templates/daemonset.yaml.tftpl", {
    namespace = var.namespace
  })
  depends_on = [
    aws_cloudwatch_log_group.karpenter_log_group
  ]
}

resource "aws_cloudwatch_log_group" "karpenter_log_group" {
  name              = var.cloudwatch_group_name
  retention_in_days = 3 # number, not string — matches the provider schema
}
/TEST/infra/terraform-module/cloudwatch-karpenter-log/lambda.tf
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
# Secret holding the Slack webhook/channel and Jira credentials as a JSON map.
data "aws_secretsmanager_secret" "slack_alerts_webhook" {
  name = var.secret_name
}

data "aws_secretsmanager_secret_version" "slack_alerts_webhook" {
  secret_id = data.aws_secretsmanager_secret.slack_alerts_webhook.id
}

# Shared Lambda layer providing the notification dependencies.
data "aws_lambda_layer_version" "lambda_layer_slack_alert" {
  layer_name = var.layer_name
}

locals {
  # Decode the secret once instead of repeating jsondecode() per env var.
  alerts_secret = jsondecode(data.aws_secretsmanager_secret_version.slack_alerts_webhook.secret_string)
}

# Daily drift checker: scans the Karpenter log group and notifies Slack + Jira.
module "lambda_cloudwatch_karpenter_checker" {
  source  = "terraform-aws-modules/lambda/aws"
  version = "7.7.0"

  publish       = true
  function_name = var.function_name
  description   = "Lambda function to check CloudWatch logs for drift and notify via Slack and Jira"
  handler       = "index.lambda_handler"
  runtime       = "python3.12"

  cloudwatch_logs_retention_in_days = 3

  layers = [
    data.aws_lambda_layer_version.lambda_layer_slack_alert.arn,
  ]

  # NOTE(review): credentials end up as plain-text Lambda environment
  # variables; consider fetching the secret at runtime inside the function.
  environment_variables = {
    SLACK_WEBHOOK  = lookup(local.alerts_secret, "SLACK_WEBHOOK", "")
    SLACK_CHANNEL  = lookup(local.alerts_secret, "SLACK_CHANNEL", "")
    LOG_GROUP_NAME = var.cloudwatch_group_name
    CLUSTER_NAME   = var.cluster_name
    JIRA_USER      = lookup(local.alerts_secret, "JIRA_USER", "")
    JIRA_TOKEN     = lookup(local.alerts_secret, "JIRA_TOKEN", "")
  }

  create_package = true
  source_path = [
    {
      path = "${path.module}/lambdas/karpenter-alert"
    }
  ]
  store_on_s3 = true
  s3_bucket   = var.lambda_bucket
  s3_prefix   = "karpenter-alert/"

  attach_policy_json = true
  policy_json = jsonencode(
    {
      "Version" : "2012-10-17",
      "Statement" : [
        {
          "Effect" : "Allow",
          "Action" : [
            "s3:Get*",
            "s3:List*",
            "s3:Describe*",
          ],
          # Bug fix: object-level actions (s3:GetObject) apply to object ARNs,
          # so the "<bucket>/*" resource is required; the bare bucket ARN only
          # covers bucket-level actions such as s3:ListBucket.
          "Resource" : [
            "arn:aws:s3:::${var.lambda_bucket}",
            "arn:aws:s3:::${var.lambda_bucket}/*"
          ]
        },
        {
          "Action" : [
            "logs:PutLogEvents",
            "logs:CreateLogStream",
            "logs:CreateLogGroup",
            "logs:FilterLogEvents"
          ],
          "Effect" : "Allow",
          "Resource" : [
            "arn:aws:logs:${var.logs_region}:${var.account_id}:log-group:${var.cloudwatch_group_name}:*:*",
            "arn:aws:logs:${var.logs_region}:${var.account_id}:log-group:${var.cloudwatch_group_name}:*"
          ]
        }
      ]
    }
  )

  # Let the EventBridge schedule (see eventbridge.tf) invoke this function.
  allowed_triggers = {
    EventBridge = {
      principal  = "events.amazonaws.com"
      source_arn = "arn:aws:events:${var.logs_region}:${var.account_id}:rule/*"
    }
  }
}
/TEST/infra/terraform-module/cloudwatch-karpenter-log/eventbridge.tf
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
# EventBridge schedule that triggers the drift-checker Lambda once a day.
module "eventbridge_rule" {
  source  = "terraform-aws-modules/eventbridge/aws"
  version = "3.2.2"

  # Use the default event bus; the Lambda module grants invoke permission
  # via allowed_triggers, so no dedicated IAM role is created here.
  create_bus  = false
  create_role = false

  rules = {
    cloudwatch_karpenter_log_check = {
      description         = "Trigger Lambda function daily to check CloudWatch logs"
      schedule_expression = "cron(0 5 * * ? *)" # every day at 05:00 UTC
    }
  }

  targets = {
    cloudwatch_karpenter_log_check = [
      {
        name = var.function_name
        arn  = module.lambda_cloudwatch_karpenter_checker.lambda_function_arn
      }
    ]
  }
}
/TEST/infra/terraform-module/cloudwatch-karpenter-log/variables.tf
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
# Module inputs. Types and descriptions added so `terraform validate`
# catches wrong values and the module is self-documenting; all callers
# already pass strings, so this is backward compatible.

variable "cluster_name" {
  description = "EKS cluster name, used in Fluent Bit config and alert text"
  type        = string
}

variable "http_port" {
  description = "Port for the Fluent Bit built-in HTTP server (e.g. \"2020\")"
  type        = string
}

variable "http_server" {
  description = "Enable the Fluent Bit HTTP server (\"On\"/\"Off\")"
  type        = string
}

variable "logs_region" {
  description = "AWS region the logs are shipped to"
  type        = string
}

variable "read_head" {
  description = "Fluent Bit Read_from_Head setting (\"On\"/\"Off\")"
  type        = string
}

variable "read_tail" {
  description = "Fluent Bit read-from-tail setting (\"On\"/\"Off\")"
  type        = string
}

variable "role_arn" {
  description = "The ARN of the IAM role to associate with the ServiceAccount"
  type        = string
}

variable "namespace" {
  description = "Kubernetes namespace for the Fluent Bit resources"
  type        = string
}

variable "cloudwatch_group_name" {
  description = "CloudWatch log group that receives Karpenter logs"
  type        = string
}

variable "lambda_bucket" {
  description = "The S3 bucket to store the Lambda function"
  type        = string
}

variable "script_s3_path" {
  description = "The S3 path to the Lambda function script"
  type        = string
}

variable "secret_name" {
  description = "The secret ID in AWS Secrets Manager"
  type        = string
}

variable "account_id" {
  description = "AWS account ID"
  type        = string
}

variable "layer_name" {
  description = "Name of the existing Lambda layer with notification dependencies"
  type        = string
}

variable "function_name" {
  description = "Name of the drift-checker Lambda function"
  type        = string
}
вот сам скрипт
/TEST/infra/terraform-module/cloudwatch-karpenter-log/lambdas/karpenter-alert/index.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
"""Check the Karpenter CloudWatch log group for drift events.

Triggered daily by EventBridge. If any log event in the last 24 hours
matches the filter pattern "drift", a Slack notification is sent and a
Jira task is created asking to pin a fresh AMI.
"""
import boto3
import os
import json
import requests
from datetime import datetime, timedelta

# Configuration is injected by Terraform as Lambda environment variables.
slack_webhook = os.getenv('SLACK_WEBHOOK')
slack_channel = os.getenv('SLACK_CHANNEL')
log_group_name = os.getenv('LOG_GROUP_NAME')
jira_user = os.getenv('JIRA_USER')
jira_token = os.getenv('JIRA_TOKEN')
cluster_name = os.getenv('CLUSTER_NAME')

# The boto3 client is created once at import time and reused across warm
# Lambda invocations.
cloudwatch_logs_client = boto3.client('logs')


def find_drift_in_logs():
    """Scan the last 24h of the log group for 'drift' events and alert once.

    Raises:
        ValueError: propagated from the Slack/Jira notifiers on HTTP errors.
    """
    # Bug fix: compute the time window per invocation. The original computed
    # start/end at module import time, so warm Lambda containers reused a
    # stale, ever-older window on subsequent runs.
    end_time = int(datetime.now().timestamp() * 1000)
    start_time = int((datetime.now() - timedelta(days=1)).timestamp() * 1000)

    # Bug fix: filter_log_events returns at most one page of results;
    # paginate so matches beyond the first page are not silently missed.
    paginator = cloudwatch_logs_client.get_paginator('filter_log_events')
    for page in paginator.paginate(
        logGroupName=log_group_name,
        startTime=start_time,
        endTime=end_time,
        filterPattern='drift',
    ):
        if page['events']:
            send_slack_notification()
            create_jira_task()
            return  # one alert per run is enough


def send_slack_notification():
    """Post a drift alert to the configured Slack webhook.

    Raises:
        ValueError: if Slack responds with a non-200 status.
    """
    slack_message = {
        'channel': slack_channel,
        'text': f'Drift detected in CloudWatch logs for {log_group_name}.'
    }
    # timeout added so a hung webhook cannot burn the whole Lambda timeout
    response = requests.post(slack_webhook, data=json.dumps(slack_message), timeout=10)
    if response.status_code != 200:
        raise ValueError(
            f'Request to Slack returned an error {response.status_code}, '
            f'the response is:\n{response.text}'
        )


def create_jira_task():
    """Create an OPS Jira task asking to update the pinned instance AMI.

    Raises:
        ValueError: if Jira responds with a status other than 201 (Created).
    """
    jira_url = 'https://company-test.atlassian.net/rest/api/3/issue'
    jira_headers = {
        'Accept': 'application/json',
        'Content-Type': 'application/json'
    }
    jira_payload = {
        "fields": {
            "project": {"key": "OPS"},
            # Bug fix: the original string was missing the f-prefix, so the
            # literal text "{cluster_name}" appeared in the Jira summary.
            "summary": f"Drift detected in CloudWatch logs {cluster_name}",
            "description": {
                "type": "doc",  # Atlassian Document Format body
                "version": 1,
                "content": [
                    {
                        "type": "paragraph",
                        "content": [
                            {
                                "type": "text",
                                "text": f"Drift detected in cluster {cluster_name}. Update ami in the file infra/infrastructure/aws/{cluster_name}/variables/{cluster_name}.yaml Variable name: instance_ami."
                            }
                        ]
                    }
                ]
            },
            "issuetype": {"name": "Task"}
        }
    }
    response = requests.post(
        jira_url,
        headers=jira_headers,
        data=json.dumps(jira_payload),
        auth=(jira_user, jira_token),
        timeout=10,
    )
    if response.status_code != 201:
        raise ValueError(
            f'Request to Jira returned an error {response.status_code}, '
            f'the response is:\n{response.text}'
        )


def lambda_handler(event, context):
    """Lambda entry point; the event and context arguments are unused."""
    find_drift_in_logs()


if __name__ == "__main__":
    lambda_handler(None, None)
/TEST/infra/terraform-module/cloudwatch-karpenter-log/templates/clusterrole.yaml
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 |
---
# Cluster-wide read permissions for the Fluent Bit DaemonSet:
# - /metrics for scraping,
# - namespaces/pods so the kubernetes filter can enrich log records
#   with pod metadata (see configmap.yaml.tftpl).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: fluent-bit-role
rules:
  - nonResourceURLs:
      - /metrics
    verbs:
      - get
  - apiGroups: [""]
    resources:
      - namespaces
      - pods
      - pods/logs
    verbs: ["get", "list", "watch"]
/TEST/infra/terraform-module/cloudwatch-karpenter-log/templates/clusterrolebinding.yaml.tftpl
1 2 3 4 5 6 7 8 9 10 11 12 13 14 |
---
# Binds the fluent-bit ClusterRole to the fluent-bit ServiceAccount.
# ${namespace} is interpolated by Terraform's templatefile().
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: fluent-bit-role-binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: fluent-bit-role
subjects:
  - kind: ServiceAccount
    name: fluent-bit
    namespace: ${namespace}
/TEST/infra/terraform-module/cloudwatch-karpenter-log/templates/configmap.yaml.tftpl
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
---
# Fluent Bit configuration rendered by Terraform's templatefile().
# ${...} is interpolated by Terraform at plan time; $${...} escapes to a
# literal ${...} so Fluent Bit resolves it from the container environment
# (the env vars injected by the DaemonSet from fluent-bit-cluster-info).
# The tail input reads all container logs, the grep filter keeps only
# records from the "karpenter" namespace, and the output ships them to
# the CloudWatch log group the drift-checker Lambda scans.
apiVersion: v1
kind: ConfigMap
metadata:
  name: fluent-bit-config
  namespace: ${namespace}
  labels:
    k8s-app: fluent-bit
data:
  fluent-bit.conf: |
    [SERVICE]
        Flush                     5
        Log_Level                 info
        Daemon                    off
        Parsers_File              parsers.conf
        HTTP_Server               $${HTTP_SERVER}
        HTTP_Listen               0.0.0.0
        HTTP_Port                 $${HTTP_PORT}
        storage.path              /var/fluent-bit/state/flb-storage/
        storage.sync              normal
        storage.checksum          off
        storage.backlog.mem_limit 5M

    @INCLUDE application-log.conf
  application-log.conf: |
    [INPUT]
        Name                tail
        Tag                 karpenter.*
        Path                /var/log/containers/*.log
        Exclude_Path        /var/log/containers/cloudwatch-agent*, /var/log/containers/fluent-bit*, /var/log/containers/aws-node*, /var/log/containers/kube-proxy*
        Docker_Mode         On
        Docker_Mode_Flush   5
        Docker_Mode_Parser  container_firstline
        Parser              docker
        DB                  /var/fluent-bit/state/flb_container.db
        Mem_Buf_Limit       50MB
        Skip_Long_Lines     On
        Refresh_Interval    10
        Rotate_Wait         30
        storage.type        filesystem
        Read_from_Head      $${READ_FROM_HEAD}

    [FILTER]
        Name                kubernetes
        Match               karpenter.*
        Kube_URL            https://kubernetes.default.svc:443
        Kube_Tag_Prefix     karpenter.var.log.containers.
        Merge_Log           On
        Merge_Log_Key       log_processed
        K8S-Logging.Parser  On
        K8S-Logging.Exclude Off
        Labels              Off
        Annotations         Off

    [FILTER]
        Name                grep
        Match               karpenter.*
        Regex               $kubernetes['namespace_name'] karpenter

    [OUTPUT]
        Name                cloudwatch_logs
        Match               karpenter.*
        region              $${AWS_REGION}
        log_group_name      ${cloudwatch_group_name}
        log_stream_prefix   $${HOST_NAME}-
        auto_create_group   true
        extra_user_agent    container-insights
        log_retention_days  "3"
  parsers.conf: |
    [PARSER]
        Name                docker
        Format              json
        Time_Key            time
        Time_Format         %Y-%m-%dT%H:%M:%S.%LZ

    [PARSER]
        Name                syslog
        Format              regex
        Regex               ^(?<time>[^ ]* {1,2}[^ ]* [^ ]*) (?<host>[^ ]*) (?<ident>[a-zA-Z0-9_\/\.\-]*)(?:\[(?<pid>[0-9]+)\])?(?:[^\:]*\:)? *(?<message>.*)$
        Time_Key            time
        Time_Format         %b %d %H:%M:%S

    [PARSER]
        Name                container_firstline
        Format              regex
        Regex               (?<log>(?<="log":")\S(?!\.).*?)(?<!\\)".*(?<stream>(?<="stream":").*?)".*(?<time>\d{4}-\d{1,2}-\d{1,2}T\d{2}:\d{2}:\d{2}\.\w*).*(?=})
        Time_Key            time
        Time_Format         %Y-%m-%dT%H:%M:%S.%LZ

    [PARSER]
        Name                cwagent_firstline
        Format              regex
        Regex               (?<log>(?<="log":")\d{4}[\/-]\d{1,2}[\/-]\d{1,2}[ T]\d{2}:\d{2}:\d{2}(?!\.).*?)(?<!\\)".*(?<stream>(?<="stream":").*?)".*(?<time>\d{4}-\d{1,2}-\d{1,2}T\d{2}:\d{2}:\d{2}\.\w*).*(?=})
        Time_Key            time
        Time_Format         %Y-%m-%dT%H:%M:%S.%LZ
/TEST/infra/terraform-module/cloudwatch-karpenter-log/templates/daemonset.yaml.tftpl
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
---
# Fluent Bit DaemonSet that ships Karpenter container logs to CloudWatch.
# Configuration values come from the fluent-bit-cluster-info ConfigMap
# (created in main.tf) and the fluent-bit-config ConfigMap (templates).
# ${namespace} is interpolated by Terraform's templatefile().
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: fluent-bit
  namespace: ${namespace}
  labels:
    k8s-app: fluent-bit
    version: v1
    kubernetes.io/cluster-service: "true"
spec:
  selector:
    matchLabels:
      k8s-app: fluent-bit
  template:
    metadata:
      labels:
        k8s-app: fluent-bit
        version: v1
        kubernetes.io/cluster-service: "true"
    spec:
      containers:
        - name: fluent-bit
          image: amazon/aws-for-fluent-bit:2.32.2.20240627
          imagePullPolicy: Always
          securityContext:
            allowPrivilegeEscalation: false
          # All of these env vars are referenced via $${...} placeholders
          # inside fluent-bit.conf (see configmap.yaml.tftpl).
          env:
            - name: AWS_REGION
              valueFrom:
                configMapKeyRef:
                  name: fluent-bit-cluster-info
                  key: logs.region
            - name: CLUSTER_NAME
              valueFrom:
                configMapKeyRef:
                  name: fluent-bit-cluster-info
                  key: cluster.name
            - name: HTTP_SERVER
              valueFrom:
                configMapKeyRef:
                  name: fluent-bit-cluster-info
                  key: http.server
            - name: HTTP_PORT
              valueFrom:
                configMapKeyRef:
                  name: fluent-bit-cluster-info
                  key: http.port
            - name: READ_FROM_HEAD
              valueFrom:
                configMapKeyRef:
                  name: fluent-bit-cluster-info
                  key: read.head
            - name: READ_FROM_TAIL
              valueFrom:
                configMapKeyRef:
                  name: fluent-bit-cluster-info
                  key: read.tail
            # Node name, used as the CloudWatch log stream prefix.
            - name: HOST_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: CI_VERSION
              value: "k8s/1.3.9"
          resources:
            limits:
              memory: 200Mi
              cpu: 100m
            requests:
              cpu: 50m
              memory: 100Mi
          volumeMounts:
            # Please don't change below read-only permissions
            - name: fluentbitstate
              mountPath: /var/fluent-bit/state
            - name: varlog
              mountPath: /var/log
              readOnly: true
            - name: varlibdockercontainers
              mountPath: /var/lib/docker/containers
              readOnly: true
            - name: fluent-bit-config
              mountPath: /fluent-bit/etc/
            - name: runlogjournal
              mountPath: /run/log/journal
              readOnly: true
            - name: dmesg
              mountPath: /var/log/dmesg
              readOnly: true
      terminationGracePeriodSeconds: 10
      volumes:
        - name: fluentbitstate
          hostPath:
            path: /var/fluent-bit/state
        - name: varlog
          hostPath:
            path: /var/log
        - name: varlibdockercontainers
          hostPath:
            path: /var/lib/docker/containers
        - name: fluent-bit-config
          configMap:
            name: fluent-bit-config
        - name: runlogjournal
          hostPath:
            path: /run/log/journal
        - name: dmesg
          hostPath:
            path: /var/log/dmesg
      serviceAccountName: fluent-bit
      # Pin the DaemonSet to the infra nodegroup nodes (where Karpenter runs)
      # rather than to Karpenter-provisioned nodes.
      nodeSelector:
        kubernetes.io/os: linux
        dedication: "infra"
        provisioning: "nodegroup"
      tolerations:
        - key: "dedicated"
          operator: "Equal"
          value: "infra"
          effect: "NoSchedule"
/TEST/infra/terraform-module/cloudwatch-karpenter-log/templates/gen-configmap.yaml.tftpl
1 2 3 4 5 6 7 |
# Key/value snapshot of the fluent-bit-cluster-info data. NOTE(review):
# this template is only rendered as the keeper of random_id.configmap in
# main.tf (presumably to detect config changes); it is not applied to the
# cluster itself — confirm before extending it.
cluster.name: ${cluster_name}
http.port: ${http_port}
http.server: ${http_server}
logs.region: ${logs_region}
read.head: ${read_head}
read.tail: ${read_tail}
/TEST/infra/terraform-module/cloudwatch-karpenter-log/templates/serviceaccount.yaml.tftpl
1 2 3 4 5 6 7 8 9 |
---
# ServiceAccount for the Fluent Bit DaemonSet. The eks.amazonaws.com/role-arn
# annotation enables IRSA so the pods can write to CloudWatch Logs.
# ${role_arn} and ${namespace} are interpolated by Terraform's templatefile().
apiVersion: v1
kind: ServiceAccount
metadata:
  name: fluent-bit
  namespace: ${namespace}
  annotations:
    eks.amazonaws.com/role-arn: ${role_arn}
/TEST/infra/infrastructure/aws/test-dev/main.tf
1 2 3 4 |
# Load the per-environment settings from the YAML file next to this stack;
# local.settings is consumed by the module calls in this directory.
locals {
  vars     = file("variables/test-dev.yaml")
  settings = yamldecode(local.vars)
}
Тут задаём часть переменных:
/TEST/infra/infrastructure/aws/test-dev/variables/test-dev.yaml
1 2 3 4 5 6 |
# Per-environment settings consumed via local.settings in main.tf.
eks:
  eks_cluster_name: "test-dev"
  eks_cluster_version: "1.29"
  eks_services_cidr: "172.19.0.0/16"
а вот так мы этот модуль запускаем
/TEST/infra/infrastructure/aws/test-dev/aws-eks-log-cloudwatch.tf
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 |
# Deploys the Karpenter drift-monitoring stack: Fluent Bit DaemonSet shipping
# Karpenter logs to CloudWatch, plus the daily Lambda checker that alerts
# Slack and creates a Jira task when "drift" appears in the logs.
module "cloudwatch_karpenter_log" {
  source       = "../../../terraform-module/cloudwatch-karpenter-log"
  cluster_name = local.settings.eks.eks_cluster_name
  http_port    = "2020"
  http_server  = "On"
  logs_region  = local.region
  # NOTE(review): .id of aws_caller_identity resolves to the account ID, but
  # .account_id is the explicit attribute — consider switching for clarity.
  account_id            = data.aws_caller_identity.current.id
  read_head             = "On"
  read_tail             = "On"
  role_arn              = module.aws_iam_cloudwatch_log.role_arn
  namespace             = "cloudwatch-logs"
  cloudwatch_group_name = "/aws/${local.settings.eks.eks_cluster_name}/karpenter-log"
  lambda_bucket         = "test-dev-lambda-functions"
  script_s3_path        = "karpenter-alert"
  secret_name           = "dev/infra/alerts-slack-webhooks"
  layer_name            = "notifications" #(lambda layer already exist in module notifications)
  function_name         = "cloudwatch-karpenter-log-checker"
}