Terraform module that deploys kube-state-metrics on Kubernetes.
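A minimal usage sketch is shown below. The source address and variable values are illustrative only; they are inferred from the variables this configuration references (the module exposes many more, such as per-resource names, labels and annotations).

module "kube_state_metrics" {
  # Assumed Git source; point this at wherever the module is actually hosted.
  source = "git::https://scm.dazzlingwrench.fxinnovation.com/fxinnovation-public/terraform-module-kubernetes-kube-state-metrics.git"

  enabled       = true
  rbac_enabled  = true
  namespace     = "monitoring"
  image_version = "v1.9.7" # example tag only

  # Extra labels merged into every generated Prometheus alert rule.
  prometheus_alert_groups_rules_labels = {
    "team" = "platform"
  }
}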
#####
# Locals
#####

locals {
  labels = {
    "app.kubernetes.io/version"    = var.image_version
    "app.kubernetes.io/component"  = "exporter"
    "app.kubernetes.io/part-of"    = "monitoring"
    "app.kubernetes.io/managed-by" = "terraform"
    "app.kubernetes.io/name"       = "kube-state-metrics"
  }

  port              = 8080
  service_port      = 80
  service_port_name = "http"

  prometheus_alert_groups_rules_labels = merge(
    {
      "source" = "https://scm.dazzlingwrench.fxinnovation.com/fxinnovation-public/terraform-module-kubernetes-kube-state-metrics"
    },
    var.prometheus_alert_groups_rules_labels
  )

  prometheus_alert_groups_rules_annotations = merge(
    {},
    var.prometheus_alert_groups_rules_annotations
  )

  prometheus_alert_groups = [
    {
      "name" = "kube-state-metrics"
      "rules" = [
        {
          "alert" = "kube-state-metrics - deployment availability warning"
          "expr"  = "(kube_deployment_status_replicas_available / kube_deployment_spec_replicas) * 100 < 100"
          "for"   = "15m"
          "labels" = merge(
            {
              "severity" = "warning"
              "urgency"  = "3"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Deployment replica availability warning on {{ $labels.namespace }}-{{ $labels.deployment }}"
              "description" = "kube-state-metrics:\nDeployment {{ $labels.deployment }} in namespace {{ $labels.namespace }} has had less than 100% available replicas for 15min.\nValue: {{ $value }}%\nLabels: {{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - deployment availability critical"
          "expr"  = "(kube_deployment_status_replicas_available / kube_deployment_spec_replicas) * 100 < 50"
          "for"   = "15m"
          "labels" = merge(
            {
              "severity" = "critical"
              "urgency"  = "2"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Deployment replica availability critical on {{ $labels.namespace }}-{{ $labels.deployment }}"
              "description" = "kube-state-metrics:\nDeployment {{ $labels.deployment }} in namespace {{ $labels.namespace }} has had less than 50% available replicas for 15min.\nValue: {{ $value }}%\nLabels: {{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - deployment availability down"
          "expr"  = "(kube_deployment_status_replicas_available / kube_deployment_spec_replicas) * 100 == 0 and kube_deployment_spec_replicas > 1"
          "for"   = "1m"
          "labels" = merge(
            {
              "severity" = "critical"
              "urgency"  = "2"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Deployment down on {{ $labels.namespace }}-{{ $labels.deployment }}"
              "description" = "kube-state-metrics:\nDeployment {{ $labels.deployment }} in namespace {{ $labels.namespace }} has had 0% available replicas for 1min.\nLabels: {{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - deployment availability down single replica"
          "expr"  = "(kube_deployment_status_replicas_available / kube_deployment_spec_replicas) * 100 == 0 and kube_deployment_spec_replicas == 1"
          "for"   = "15m"
          "labels" = merge(
            {
              "severity" = "critical"
              "urgency"  = "2"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Deployment down on {{ $labels.namespace }}-{{ $labels.deployment }}"
              "description" = "kube-state-metrics:\nDeployment {{ $labels.deployment }} in namespace {{ $labels.namespace }} has had 0% available replicas for 15min.\nLabels: {{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - statefulset availability warning"
          "expr"  = "(kube_statefulset_status_replicas_ready / kube_statefulset_replicas) * 100 < 100"
          "for"   = "15m"
          "labels" = merge(
            {
              "severity" = "warning"
              "urgency"  = "3"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Statefulset replica availability warning on {{ $labels.namespace }}-{{ $labels.statefulset }}"
              "description" = "kube-state-metrics:\nStatefulset {{ $labels.statefulset }} in namespace {{ $labels.namespace }} has had less than 100% available replicas for 15min.\nValue: {{ $value }}%\nLabels: {{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - statefulset availability critical"
          "expr"  = "(kube_statefulset_status_replicas_ready / kube_statefulset_replicas) * 100 < 50"
          "for"   = "15m"
          "labels" = merge(
            {
              "severity" = "critical"
              "urgency"  = "2"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Statefulset replica availability critical on {{ $labels.namespace }}-{{ $labels.statefulset }}"
              "description" = "kube-state-metrics:\nStatefulset {{ $labels.statefulset }} in namespace {{ $labels.namespace }} has had less than 50% available replicas for 15min.\nValue: {{ $value }}%\nLabels: {{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - statefulset availability down"
          "expr"  = "(kube_statefulset_status_replicas_ready / kube_statefulset_replicas) * 100 == 0 and kube_statefulset_replicas > 1"
          "for"   = "1m"
          "labels" = merge(
            {
              "severity" = "critical"
              "urgency"  = "2"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Statefulset down on {{ $labels.namespace }}-{{ $labels.statefulset }}"
              "description" = "kube-state-metrics:\nStatefulset {{ $labels.statefulset }} in namespace {{ $labels.namespace }} has had 0% available replicas for 1min.\nLabels: {{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - statefulset availability down single replica"
          "expr"  = "(kube_statefulset_status_replicas_ready / kube_statefulset_replicas) * 100 == 0 and kube_statefulset_replicas == 1"
          "for"   = "15m"
          "labels" = merge(
            {
              "severity" = "critical"
              "urgency"  = "2"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Statefulset down on {{ $labels.namespace }}-{{ $labels.statefulset }}"
              "description" = "kube-state-metrics:\nStatefulset {{ $labels.statefulset }} in namespace {{ $labels.namespace }} has had 0% available replicas for 15min.\nLabels: {{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - daemonset availability critical"
          "expr"  = "(kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled) * 100 < 100"
          "for"   = "5m"
          "labels" = merge(
            {
              "severity" = "critical"
              "urgency"  = "2"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Daemonset Replica Availability Critical on {{ $labels.namespace }}-{{ $labels.daemonset }}"
              "description" = "kube-state-metrics:\nDaemonset {{ $labels.daemonset }} in namespace {{ $labels.namespace }} has had less than 100% available replicas for 5min.\nValue: {{ $value }}%\nLabels: {{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - pod container oom killed"
          "expr"  = "kube_pod_container_status_terminated_reason{reason=\"OOMKilled\"} > 0"
          "for"   = "1m"
          "labels" = merge(
            {
              "severity" = "warning"
              "urgency"  = "3"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Container got {{ $labels.reason }} for {{ $labels.namespace }}-{{ $labels.pod }}-{{ $labels.container }}"
              "description" = "kube-state-metrics:\nContainer {{ $labels.container }} got {{ $labels.reason }} on pod {{ $labels.pod }} in namespace {{ $labels.namespace }}\nLabels:\n{{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - pod container cannot run"
          "expr"  = "kube_pod_container_status_terminated_reason{reason=\"ContainerCannotRun\"} > 0"
          "for"   = "1m"
          "labels" = merge(
            {
              "severity" = "warning"
              "urgency"  = "3"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Container terminated with {{ $labels.reason }} reason for {{ $labels.namespace }}-{{ $labels.pod }}-{{ $labels.container }}"
              "description" = "kube-state-metrics:\nContainer {{ $labels.container }} terminated with {{ $labels.reason }} reason on pod {{ $labels.pod }} in namespace {{ $labels.namespace }}\nLabels:\n{{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - pod container crashloop backoff"
          "expr"  = "kube_pod_container_status_waiting_reason{reason=\"CrashLoopBackOff\"} > 0"
          "for"   = "5m"
          "labels" = merge(
            {
              "severity" = "critical"
              "urgency"  = "2"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Container {{ $labels.namespace }}-{{ $labels.pod }}-{{ $labels.container }} is waiting to be created with {{ $labels.reason }} reason."
              "description" = "kube-state-metrics:\nContainer {{ $labels.container }} is waiting to be created with {{ $labels.reason }} reason on pod {{ $labels.pod }} in namespace {{ $labels.namespace }}\nLabels:\n{{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - pod container config creation error"
          "expr"  = "kube_pod_container_status_waiting_reason{reason=\"CreateContainerConfigError\"} > 0"
          "for"   = "5m"
          "labels" = merge(
            {
              "severity" = "critical"
              "urgency"  = "2"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Container {{ $labels.namespace }}-{{ $labels.pod }}-{{ $labels.container }} is waiting to be created with {{ $labels.reason }} reason."
              "description" = "kube-state-metrics:\nContainer {{ $labels.container }} is waiting to be created with {{ $labels.reason }} reason on pod {{ $labels.pod }} in namespace {{ $labels.namespace }}\nLabels:\n{{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - pod container image pull backoff"
          "expr"  = "kube_pod_container_status_waiting_reason{reason=\"ImagePullBackOff\"} > 0"
          "for"   = "5m"
          "labels" = merge(
            {
              "severity" = "critical"
              "urgency"  = "2"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Container {{ $labels.namespace }}-{{ $labels.pod }}-{{ $labels.container }} is waiting to be created with {{ $labels.reason }} reason."
              "description" = "kube-state-metrics:\nContainer {{ $labels.container }} is waiting to be created with {{ $labels.reason }} reason on pod {{ $labels.pod }} in namespace {{ $labels.namespace }}\nLabels:\n{{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - pod failed phase"
          "expr"  = "kube_pod_status_phase{phase=\"Failed\"} > 0"
          "for"   = "15m"
          "labels" = merge(
            {
              "severity" = "warning"
              "urgency"  = "3"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Pod {{ $labels.namespace }}-{{ $labels.pod }} has a phase of Failed."
              "description" = "kube-state-metrics:\nPod {{ $labels.pod }} has been in phase Failed for 15m in namespace {{ $labels.namespace }}\nLabels:\n{{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - pod unknown phase"
          "expr"  = "kube_pod_status_phase{phase=\"Unknown\"} > 0"
          "for"   = "15m"
          "labels" = merge(
            {
              "severity" = "warning"
              "urgency"  = "3"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Pod {{ $labels.namespace }}-{{ $labels.pod }} has a phase of Unknown."
              "description" = "kube-state-metrics:\nPod {{ $labels.pod }} has been in phase Unknown for 15m in namespace {{ $labels.namespace }}\nLabels:\n{{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - persistent volume failed phase"
          "expr"  = "kube_persistentvolume_status_phase{phase=\"Failed\"} > 0"
          "for"   = "5m"
          "labels" = merge(
            {
              "severity" = "critical"
              "urgency"  = "2"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Persistent Volume {{ $labels.namespace }}-{{ $labels.persistentvolume }} has a phase of Failed."
              "description" = "kube-state-metrics:\nPersistent Volume {{ $labels.persistentvolume }} has been in phase Failed for 5m in namespace {{ $labels.namespace }}\nLabels:\n{{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - persistent volume pending phase"
          "expr"  = "kube_persistentvolume_status_phase{phase=\"Pending\"} > 0"
          "for"   = "15m"
          "labels" = merge(
            {
              "severity" = "warning"
              "urgency"  = "3"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Persistent Volume {{ $labels.namespace }}-{{ $labels.persistentvolume }} has a phase of Pending."
              "description" = "kube-state-metrics:\nPersistent Volume {{ $labels.persistentvolume }} has been in phase Pending for 15m in namespace {{ $labels.namespace }}\nLabels:\n{{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - persistent volume claim pending phase"
          "expr"  = "kube_persistentvolumeclaim_status_phase{phase=\"Pending\"} > 0"
          "for"   = "15m"
          "labels" = merge(
            {
              "severity" = "warning"
              "urgency"  = "3"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Persistent Volume Claim {{ $labels.namespace }}-{{ $labels.persistentvolumeclaim }} has a phase of Pending."
              "description" = "kube-state-metrics:\nPersistent Volume Claim {{ $labels.persistentvolumeclaim }} has been in phase Pending for 15m in namespace {{ $labels.namespace }}\nLabels:\n{{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - persistent volume claim lost phase"
          "expr"  = "kube_persistentvolumeclaim_status_phase{phase=\"Lost\"} > 0"
          "for"   = "5m"
          "labels" = merge(
            {
              "severity" = "critical"
              "urgency"  = "2"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Persistent Volume Claim {{ $labels.namespace }}-{{ $labels.persistentvolumeclaim }} has a phase of Lost."
              "description" = "kube-state-metrics:\nPersistent Volume Claim {{ $labels.persistentvolumeclaim }} has been in phase Lost for 5m in namespace {{ $labels.namespace }}\nLabels:\n{{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - endpoint address not ready"
          "expr"  = "kube_endpoint_address_not_ready > 0"
          "for"   = "5m"
          "labels" = merge(
            {
              "severity" = "critical"
              "urgency"  = "2"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Endpoint Address {{ $labels.namespace }}-{{ $labels.endpoint }} is not in a Ready state."
              "description" = "kube-state-metrics:\nEndpoint Address {{ $labels.endpoint }} has not been Ready for 5m in namespace {{ $labels.namespace }}\nLabels:\n{{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - service load balancer ingress down"
          "expr"  = "kube_service_status_load_balancer_ingress < 1"
          "for"   = "5m"
          "labels" = merge(
            {
              "severity" = "critical"
              "urgency"  = "2"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Service Load Balancer Ingress {{ $labels.namespace }}-{{ $labels.service }} is down."
              "description" = "kube-state-metrics:\nService Load Balancer Ingress {{ $labels.service }} has been down for 5m in namespace {{ $labels.namespace }}\nLabels:\n{{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - node condition unknown"
          "expr"  = "kube_node_status_condition{status=\"unknown\"} > 0"
          "for"   = "5m"
          "labels" = merge(
            {
              "severity" = "critical"
              "urgency"  = "2"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Condition {{ $labels.condition }} on node {{ $labels.node }} is unknown."
              "description" = "kube-state-metrics:\nCondition {{ $labels.condition }} on node {{ $labels.node }} has been unknown for 5m.\nLabels:\n{{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - node condition true"
          "expr"  = "kube_node_status_condition{condition!=\"Ready\",status=\"true\"} > 0"
          "for"   = "5m"
          "labels" = merge(
            {
              "severity" = "critical"
              "urgency"  = "2"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Condition {{ $labels.condition }} on node {{ $labels.node }} is failing."
              "description" = "kube-state-metrics:\nCondition {{ $labels.condition }} on node {{ $labels.node }} has been failing for 5m.\nLabels:\n{{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - node condition false"
          "expr"  = "kube_node_status_condition{condition=\"Ready\",status=\"false\"} > 0"
          "for"   = "5m"
          "labels" = merge(
            {
              "severity" = "critical"
              "urgency"  = "2"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Condition {{ $labels.condition }} on node {{ $labels.node }} is failing."
              "description" = "kube-state-metrics:\nCondition {{ $labels.condition }} on node {{ $labels.node }} has been failing for 5m.\nLabels:\n{{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - node memory limits warning"
          "expr"  = "sum(kube_pod_container_resource_limits_memory_bytes) by (instance, node) * 100 / sum(kube_node_status_allocatable_memory_bytes) by (instance, node) > 200"
          "for"   = "15m"
          "labels" = merge(
            {
              "severity" = "warning"
              "urgency"  = "3"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Node {{ $labels.node }} on cluster {{ $labels.instance }} is under-provisioned in memory"
              "description" = "kube-state-metrics:\nNode {{ $labels.node }} on cluster {{ $labels.instance }} is under-provisioned in memory.\nDescription:\nPods on that node are allowed to use up to {{ $value }}% of the allocatable memory on the node.\nLabels:\n{{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - node memory requests warning"
          "expr"  = "sum(kube_pod_container_resource_requests_memory_bytes) by (instance, node) * 100 / sum(kube_node_status_allocatable_memory_bytes) by (instance, node) > 100"
          "for"   = "15m"
          "labels" = merge(
            {
              "severity" = "warning"
              "urgency"  = "3"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Node {{ $labels.node }} on cluster {{ $labels.instance }} is dangerously under-provisioned in memory"
              "description" = "kube-state-metrics:\n\nNode {{ $labels.node }} on cluster {{ $labels.instance }} is dangerously under-provisioned in memory.\nDescription:\nPods on the node are requesting {{ $value }}% of the allocatable memory of the node.\nLabels:\n{{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - cluster memory limits warning"
          "expr"  = "sum(kube_pod_container_resource_limits_memory_bytes) by (instance) * 100 / sum(kube_node_status_allocatable_memory_bytes) by (instance) > 200"
          "for"   = "15m"
          "labels" = merge(
            {
              "severity" = "warning"
              "urgency"  = "3"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Cluster {{ $labels.instance }} is under-provisioned in memory"
              "description" = "kube-state-metrics:\n\nCluster {{ $labels.instance }} is under-provisioned in memory.\nDescription:\nPods on the cluster are currently allowed to use up to {{ $value }}% of the allocatable memory of the cluster.\nLabels:\n{{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        },
        {
          "alert" = "kube-state-metrics - cluster memory requests warning"
          "expr"  = "sum(kube_pod_container_resource_requests_memory_bytes) by (instance) * 100 / sum(kube_node_status_allocatable_memory_bytes) by (instance) > 100"
          "for"   = "15m"
          "labels" = merge(
            {
              "severity" = "warning"
              "urgency"  = "3"
            },
            local.prometheus_alert_groups_rules_labels
          )
          "annotations" = merge(
            {
              "summary"     = "kube-state-metrics - Cluster {{ $labels.instance }} is dangerously under-provisioned in memory"
              "description" = "kube-state-metrics:\n\nCluster {{ $labels.instance }} is dangerously under-provisioned in memory.\nDescription:\nPods on the cluster are requesting {{ $value }}% of the allocatable memory of the cluster.\nLabels:\n{{ $labels }}"
            },
            local.prometheus_alert_groups_rules_annotations
          )
        }
      ]
    }
  ]
}
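# The alert groups assembled above are intended for Prometheus alerting. One
# way to consume them (a sketch only, not part of this file) is to expose them
# as a module output and render them into a rule file in the calling
# configuration, for example with yamlencode and the hashicorp/local provider:
#
#   output "prometheus_alert_groups" {
#     value = local.prometheus_alert_groups
#   }
#
#   resource "local_file" "kube_state_metrics_rules" {
#     filename = "kube-state-metrics.rules.yaml"
#     content  = yamlencode({ groups = module.kube_state_metrics.prometheus_alert_groups })
#   }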
#####
# Randoms
#####

resource "random_string" "selector" {
  special = false
  upper   = false
  number  = false
  length  = 8
}
#####
# Deployment
#####

resource "kubernetes_deployment" "this" {
  count = var.enabled ? 1 : 0

  metadata {
    name      = var.deployment_name
    namespace = var.namespace
    annotations = merge(
      var.annotations,
      var.deployment_annotations
    )
    labels = merge(
      {
        "app.kubernetes.io/instance" = var.deployment_name
      },
      local.labels,
      var.labels,
      var.deployment_labels
    )
  }

  spec {
    replicas = 1

    selector {
      match_labels = {
        app    = "kube-state-metrics"
        random = random_string.selector.result
      }
    }

    template {
      metadata {
        annotations = merge(
          var.annotations,
          var.deployment_annotations
        )
        labels = merge(
          {
            "app.kubernetes.io/instance" = var.deployment_name
            app                          = "kube-state-metrics"
            random                       = random_string.selector.result
          },
          local.labels,
          var.labels,
          var.deployment_labels
        )
      }

      spec {
        automount_service_account_token = true
        service_account_name            = var.rbac_enabled ? element(concat(kubernetes_service_account.this.*.metadata.0.name, list("")), 0) : var.deployment_service_account_name

        container {
          name              = "kube-state-metrics"
          image             = "${var.image_name}:${var.image_version}"
          image_pull_policy = var.image_pull_policy

          readiness_probe {
            http_get {
              path   = "/"
              port   = local.port
              scheme = "HTTP"
            }

            timeout_seconds   = 5
            period_seconds    = 5
            success_threshold = 1
            failure_threshold = 35
          }

          liveness_probe {
            http_get {
              path   = "/"
              port   = local.port
              scheme = "HTTP"
            }

            timeout_seconds   = 5
            period_seconds    = 10
            success_threshold = 1
            failure_threshold = 3
          }

          port {
            name           = "metrics"
            container_port = local.port
            protocol       = "TCP"
          }

          resources {
            requests {
              memory = "16Mi"
              cpu    = "10m"
            }

            limits {
              memory = "128Mi"
              cpu    = "50m"
            }
          }
        }
      }
    }
  }
}
#####
# Service
#####

resource "kubernetes_service" "this" {
  count = var.enabled ? 1 : 0

  metadata {
    name      = var.service_name
    namespace = var.namespace
    annotations = merge(
      {
        "prometheus.io/scrape" = "true"
      },
      var.annotations,
      var.service_annotations
    )
    labels = merge(
      {
        "app.kubernetes.io/instance" = var.service_name
      },
      local.labels,
      var.labels,
      var.service_labels
    )
  }

  spec {
    selector = {
      random = random_string.selector.result
      app    = "kube-state-metrics"
    }

    type = "ClusterIP"

    port {
      port        = local.service_port
      target_port = "metrics"
      protocol    = "TCP"
      name        = local.service_port_name
    }
  }
}
#####
# Service Account
#####

resource "kubernetes_service_account" "this" {
  count = var.enabled && var.rbac_enabled ? 1 : 0

  automount_service_account_token = var.service_account_automount_service_account_token

  metadata {
    name      = var.service_account_name
    namespace = var.namespace
    annotations = merge(
      var.annotations,
      var.service_account_annotations
    )
    labels = merge(
      {
        "app.kubernetes.io/instance" = var.service_account_name
      },
      local.labels,
      var.labels,
      var.service_account_labels
    )
  }
}
#####
# Cluster Role
#####

resource "kubernetes_cluster_role" "this" {
  count = var.enabled && var.rbac_enabled ? 1 : 0

  metadata {
    name = var.cluster_role_name
    annotations = merge(
      var.annotations,
      var.cluster_role_annotations
    )
    labels = merge(
      {
        "app.kubernetes.io/instance" = var.cluster_role_name
      },
      local.labels,
      var.labels,
      var.cluster_role_labels
    )
  }

  rule {
    api_groups = [""]
    resources = [
      "namespaces",
      "nodes",
      "persistentvolumeclaims",
      "pods",
      "services",
      "resourcequotas",
      "replicationcontrollers",
      "limitranges",
      "persistentvolumes",
      "endpoints",
      "secrets",
      "configmaps",
    ]
    verbs = ["list", "watch"]
  }

  rule {
    api_groups = ["extensions"]
    resources  = ["daemonsets", "deployments", "ingresses", "replicasets"]
    verbs      = ["list", "watch"]
  }

  rule {
    api_groups = ["apps"]
    resources  = ["daemonsets", "deployments", "statefulsets", "replicasets"]
    verbs      = ["get", "list", "watch"]
  }

  rule {
    api_groups = ["batch"]
    resources  = ["cronjobs", "jobs"]
    verbs      = ["list", "watch"]
  }

  rule {
    api_groups = ["autoscaling"]
    resources  = ["horizontalpodautoscalers"]
    verbs      = ["list", "watch"]
  }

  rule {
    api_groups = ["authorization.k8s.io"]
    resources  = ["subjectaccessreviews"]
    verbs      = ["create"]
  }

  rule {
    api_groups = ["policy"]
    resources  = ["poddisruptionbudgets"]
    verbs      = ["list", "watch"]
  }

  rule {
    api_groups = ["certificates.k8s.io"]
    resources  = ["certificatesigningrequests"]
    verbs      = ["list", "watch"]
  }

  rule {
    api_groups = ["networking.k8s.io"]
    resources  = ["networkpolicies", "ingresses"]
    verbs      = ["list", "watch"]
  }

  rule {
    api_groups = ["coordination.k8s.io"]
    resources  = ["leases"]
    verbs      = ["list", "watch"]
  }

  rule {
    api_groups = ["storage.k8s.io"]
    resources  = ["storageclasses", "volumeattachments"]
    verbs      = ["list", "watch"]
  }

  rule {
    api_groups = ["admissionregistration.k8s.io"]
    resources  = ["mutatingwebhookconfigurations", "validatingwebhookconfigurations"]
    verbs      = ["list", "watch"]
  }
}
#####
# Cluster Rolebinding
#####

resource "kubernetes_cluster_role_binding" "this" {
  count = var.enabled && var.rbac_enabled ? 1 : 0

  metadata {
    name = var.cluster_role_binding_name
    annotations = merge(
      var.annotations,
      var.cluster_role_binding_annotations
    )
    labels = merge(
      {
        "app.kubernetes.io/instance" = var.cluster_role_binding_name
      },
      local.labels,
      var.labels,
      var.cluster_role_binding_labels
    )
  }

  role_ref {
    api_group = "rbac.authorization.k8s.io"
    kind      = "ClusterRole"
    name      = element(concat(kubernetes_cluster_role.this.*.metadata.0.name, list("")), 0)
  }

  subject {
    kind      = "ServiceAccount"
    name      = element(concat(kubernetes_service_account.this.*.metadata.0.name, list("")), 0)
    namespace = var.namespace
  }
}