1.准备工作
⚠️ 安装过程中所需image需上传至私有harbor仓库。
# 由于国内网络环境问题,请直接下载prometheus chart包,并解压。
https://github.com/prometheus-community/helm-charts/#
unzip helm-charts-main.zip
# 下载opencost的scrape_job(用于opencost集成到prometheus,为prometheus提供计费数据)
# 可直接加入到prometheus的values.yaml文件当中,不需要单独用-f引用
https://raw.githubusercontent.com/opencost/opencost/develop/kubernetes/prometheus/extraScrapeConfigs.yaml
# 由于国内网络环境问题,请直接下载opencost chart包,并解压。
https://github.com/opencost/opencost-helm-chart
unzip opencost-helm-chart-main.zip
2.通过helm安装prometheus
# 配置
# 进入chart目录
cd helm-charts-main/charts/prometheus
# Edit the chart values before installing; the required changes are listed in section 4.2
vi values.yaml
# 由于prometheus安装时缺省需要依赖,所以从上一级目录把4个依赖拷到当前chart的charts/下
cp -r ../alertmanager ../kube-state-metrics ../prometheus-node-exporter ../prometheus-pushgateway charts/
# 通过Helm安装prometheus(请在安装前修改附录部分4.2的values.yaml文件)。
# Trailing backslashes keep all flags on one `helm install` invocation;
# without them the shell would run each `--flag` line as a separate command.
helm install prometheus . \
  --namespace prometheus-system \
  --create-namespace \
  --set prometheus-pushgateway.enabled=false \
  --set alertmanager.enabled=false \
  --set global.imageRegistry=10.118.17.28:30002 \
  -f values.yaml
# 检查
# 运行以下命令,检查Helm release是否成功安装:
helm list --namespace prometheus-system
# 查看Prometheus的相关Pods是否已经启动,并且没有出现错误:
kubectl get pods -n prometheus-system
# 查看Prometheus服务是否创建,并且能够正确访问:
kubectl get svc -n prometheus-system
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
prometheus-kube-state-metrics ClusterIP 10.87.112.9 <none> 8080/TCP 56m
prometheus-prometheus-node-exporter ClusterIP 10.87.156.62 <none> 9100/TCP 56m
prometheus-server ClusterIP 10.93.219.51 <none> 80/TCP 56m
# 如果你遇到任何问题,可以查看Prometheus Pods的日志,检查是否有错误或警告信息。
kubectl logs -n prometheus-system <prometheus-server-pod-name>
# 查看configmap
kubectl describe configmap prometheus-server -n prometheus-system
# 其它
# 删除
helm uninstall prometheus --namespace prometheus-system
# 重启 Prometheus server
kubectl rollout restart deployment prometheus-server -n prometheus-system
3.通过helm安装opencost
# 配置
cd opencost-helm-chart-main/charts/opencost
# 安装opencost(请在安装前修改好values.yaml文件,参看附录部分4.3)
helm install opencost . --namespace opencost
# 通过nodeport临时暴露出端口来供访问(#可以在values.yaml文件中固化)
kubectl -n opencost patch svc opencost -p '{"spec":{"type":"NodePort"}}'
# 检查
helm list --namespace opencost
kubectl get svc opencost -n opencost
kubectl get pods -n opencost
# 其它
helm upgrade opencost . --namespace opencost
helm uninstall opencost --namespace opencost
4.附录
⚠️ 1.所有没有在values.yaml文件中标注image tag的镜像,可以去对应目录下的Chart.yaml中去获取具体的image tag,也就是image的版本。
4.1 extraScrapeConfigs.yaml 文件
将此文件的内容粘贴至4.2 的values.yaml文件中的extraScrapeConfigs: 部分
# Extra Prometheus scrape job for OpenCost metrics.
# The value is a literal block scalar, so every line of the job must be
# indented under the key; otherwise the scalar is empty and the job is lost.
# Target: DNS A-record lookup of `opencost.opencost` on port 9003.
extraScrapeConfigs: |
  - job_name: opencost
    honor_labels: true
    scrape_interval: 1m
    scrape_timeout: 10s
    metrics_path: /metrics
    scheme: http
    dns_sd_configs:
      - names:
          - opencost.opencost
        type: 'A'
        port: 9003
4.2 prometheus的values.yaml文件
#=======================================
# helm安装下的prometheus的values.yaml文件
#=======================================
# Note:非完整文件,主要显示标红的修改部分。
# 1.image path modify
# 2.nodePort for prometheus
# 3.add ht-exporter
... ... ... ...
# Settings for the config-reloader container; the image repository is
# pointed at the private registry.
configmapReload:
  ## URL for configmap-reload to use for reloads
  ##
  reloadUrl: ""

  ## env sets environment variables to pass to the container. Can be set as name/value pairs,
  ## read from secrets or configmaps.
  env: []
    # - name: SOMEVAR
    #   value: somevalue
    # - name: PASSWORD
    #   valueFrom:
    #     secretKeyRef:
    #       name: mysecret
    #       key: password
    #       optional: false

  prometheus:
    ## If false, the configmap-reload container will not be deployed
    ##
    enabled: true

    ## configmap-reload container name
    ##
    name: configmap-reload

    ## configmap-reload container image
    image:
      repository: 10.118.17.28:30002/prometheus/prometheus-config-reloader
      tag: v0.86.2
      # When digest is set to a non-empty value, images will be pulled by digest (regardless of tag value).
      digest: ""
      pullPolicy: IfNotPresent

    ## config-reloader's container port and port name for probes and metrics
    containerPort: 8080
    containerPortName: metrics
... ... ... ...
# Prometheus server settings: image pulled from the private registry and
# the Service exposed as a NodePort.
server:
  ## Prometheus server container name
  ##
  name: server

  ## Prometheus server container image
  ##
  image:
    repository: 10.118.17.28:30002/prometheus/prometheus
    # if not set appVersion field from Chart.yaml is used
    tag: ""
    # When digest is set to a non-empty value, images will be pulled by digest (regardless of tag value).
    digest: ""
    pullPolicy: IfNotPresent

  ## Prometheus server command
  ##
  command: []

  # ... (other server settings omitted) ...

  service:
    ## If false, no Service will be created for the Prometheus server
    ##
    enabled: true

    annotations: {}
    labels: {}
    clusterIP: ""
    externalIPs: []
    loadBalancerIP: ""
    loadBalancerSourceRanges: []
    # In-cluster port of the Service
    servicePort: 80
    sessionAffinity: None
    # Expose the server outside the cluster on a fixed node port
    type: NodePort
    nodePort: 30090
... ... ... ...
# Excerpt of one scrape job (shown without its parent keys).
# NOTE(review): if kube-state-metrics and node-exporter are not given their own
# dedicated scrape jobs, this endpoints-based job presumably discovers them
# dynamically -- this still needs to be confirmed.
- job_name: 'kubernetes-service-endpoints'
  honor_labels: true
  kubernetes_sd_configs:
    - role: endpoints
  relabel_configs:
    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
      action: keep
... ... ... ...
# Additional scrape jobs appended to the Prometheus configuration.
# The value is a literal block scalar: every job line must stay indented
# under the key or it falls out of the scalar.
#   * ht-exporter: discovers pods in the prometheus-system namespace, keeps
#     only those whose `app` label is ht-exporter, scrapes them on
#     <pod-ip>:8002 and copies the pod's node name into the `node` label.
#   * opencost: resolves `opencost.opencost` via DNS A records, port 9003.
extraScrapeConfigs: |
  # ht-exporter
  - job_name: 'ht-exporter'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - prometheus-system
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_label_app]
        action: keep
        regex: ht-exporter
      - source_labels: [__meta_kubernetes_pod_ip]
        target_label: __address__
        replacement: $1:8002
      - source_labels: [__meta_kubernetes_pod_node_name]
        target_label: node
  # opencost
  - job_name: opencost
    honor_labels: true
    scrape_interval: 1m
    scrape_timeout: 10s
    metrics_path: /metrics
    scheme: http
    dns_sd_configs:
      - names:
          - opencost.opencost
        type: 'A'
        port: 9003
... ... ... ...
4.3 opencost的values.yaml文件
#========================
# opencost的values.yaml文件
#========================
# 1. 修改3处image路径
# -- Overwrite the default name of the chart
nameOverride: ""
# -- Overwrite all resources name created by the chart
fullnameOverride: ""
# -- Override the deployment namespace
namespaceOverride: ""
# -- Override the default name of cluster - Can be found in /etc/kubernetes/admin.conf: clusters -> cluster -> name
clusterName: "cluster.local"

loglevel: info

# Plugin support is disabled; the install image still points at the
# private registry so it can be pulled if plugins are enabled later.
plugins:
  enabled: false
  install:
    enabled: true
    fullImageName: 10.118.17.28:30002/opencost/curl:latest
    securityContext:
      allowPrivilegeEscalation: false
      seccompProfile:
        type: RuntimeDefault
      capabilities:
        drop:
          - ALL
      readOnlyRootFilesystem: true
      runAsNonRoot: true
      runAsUser: 1000
... ...
# Controls how the OpenCost service is exposed (change `type` here to make
# the exposure permanent instead of patching the Service after install).
service:
  enabled: true
  # -- Annotations to add to the service
  annotations: {}
  # -- Labels to add to the service account
  labels: {}
  # -- Kubernetes Service type
  type: ClusterIP
  # -- NodePort if service type is NodePort
  nodePort: {}
  # -- extra ports. Useful for sidecar pods such as oauth-proxy
  extraPorts: []
    # - name: oauth-proxy
    #   port: 8081
    #   targetPort: 8081
    # - name: oauth-metrics
    #   port: 8082
    #   targetPort: 8082
  # -- LoadBalancer Source IP CIDR if service type is LoadBalancer and cloud provider supports this
  loadBalancerSourceRanges: []
... ...
# OpenCost application settings: exporter, custom pricing, metrics,
# Prometheus data source and UI. Nesting is restored so that each section
# sits under `opencost:`; omitted upstream sections are marked with "...".
opencost:
  # ... (omitted) ...
  exporter:
    # API_PORT for the cost-model to listen on
    apiPort: 9003
    # debugPort: 40000 # for development purposes (debugging with delve) and not for production.
    # -- The GCP Pricing API requires a key. This is supplied just for evaluation.
    cloudProviderApiKey: ""
    # -- Default cluster ID to use if cluster_id is not set in Prometheus metrics.
    defaultClusterId: 'default-cluster'
    # -- If clusterIdConfigmap is defined, use user-generated ConfigMap with key CLUSTER_ID as default cluster ID.
    # -- This overrides the above defaultClusterId. Ensure the ConfigMap exists and contains the required CLUSTER_ID key.
    # clusterIdConfigmap: cluster-id-configmap
    image:
      # -- Exporter container image registry
      registry: 10.118.17.28:30002
      # -- Exporter container image name
      repository: opencost/opencost
      # -- Exporter container image tag
      tag: "1.118.0"
      # -- Exporter container image pull policy
      pullPolicy: IfNotPresent
      # -- Override the full image name for development purposes
      fullImageName: null
    # -- List of extra arguments for the command, e.g.: log-format=json
    extraArgs: []
    # -- Optional command to override the default container command
    command: []
    # -- Number of OpenCost replicas to run
    replicas: 1
    resources:
      # -- CPU/Memory resource requests
      requests:
        cpu: '10m'
        memory: '55Mi'
      # -- CPU/Memory resource limits
      limits:
        memory: '1Gi'
    # ... (omitted) ...
    # Persistent volume claim for storing the data. eg: csv file
    # Change these settings to mount persistent storage.
    persistence:
      enabled: false
      # -- The path that the PV will be mounted to the exporter at
      mountPath: /mnt/export
      # -- Annotations for persistent volume
      annotations: {}
      # -- Access mode for persistent volume
      accessMode: ""
      # -- Storage class for persistent volume
      storageClass: ""
      # -- Size for persistent volume
      size: ""
  # ... (omitted) ...
  # Edit this section to define custom prices.
  customPricing:
    # -- Enables custom pricing configuration
    enabled: true
    # -- Customize the configmap name used for custom pricing
    configmapName: custom-pricing-model
    # -- Path for the pricing configuration.
    configPath: /tmp/custom-config
    # -- Configures the pricing model provided in the values file.
    createConfigmap: true
    # -- Sets the provider type for the custom pricing file.
    provider: custom
    # -- More information about these values here: https://www.opencost.io/docs/configuration/on-prem#custom-pricing-using-the-opencost-helm-chart
    costModel:
      description: Modified pricing configuration.
      CPU: 1.25
      spotCPU: 0.006655
      RAM: 0.50
      spotRAM: 0.000892
      GPU: 0.95
      storage: 0.25
      zoneNetworkEgress: 0.01
      regionNetworkEgress: 0.01
      internetNetworkEgress: 0.12
  # NOTE(review): the flattened original does not show where these two keys
  # nest; they are placed directly under `opencost:` (not inside costModel)
  # -- confirm against the chart's own values.yaml.
  retention1d: 15
  retention1h: 49
  # Enable carbon cost reporting
  carbonCost:
    # -- Enable carbon cost exposed in the API
    enabled: true
  # ... (omitted) ...
  metrics:
    kubeStateMetrics:
      # -- (bool) Enable emission of pod annotations
      emitPodAnnotations: ~
      # -- (bool) Enable emission of namespace annotations
      emitNamespaceAnnotations: ~
      # -- (bool) Enable emission of KSM v1 metrics
      emitKsmV1Metrics: ~
      # -- (bool) Enable only emission of KSM v1 metrics that do not exist in KSM 2 by default
      emitKsmV1MetricsOnly: ~
    # ... (omitted) ...
    config:
      # -- Enables creating the metrics.json configuration as a ConfigMap
      enabled: false
      # -- Customize the configmap name used for metrics
      configmapName: custom-metrics
      # -- List of metrics to be disabled
      disabledMetrics: []
      # - <metric-to-be-disabled>
      # - <metric-to-be-disabled>
  # This section is what lets OpenCost query data from Prometheus.
  prometheus:
    # -- Secret name that contains credentials for Prometheus
    secret_name: ~
    # -- Existing secret name that contains credentials for Prometheus
    existingSecretName: ~
    # -- Prometheus Basic auth username
    username: ""
    # -- Key in the secret that references the username
    username_key: DB_BASIC_AUTH_USERNAME
    # -- Prometheus Basic auth password
    password: ""
    # -- Key in the secret that references the password
    password_key: DB_BASIC_AUTH_PW
    # -- Prometheus Bearer token
    bearer_token: ""
    bearer_token_key: DB_BEARER_TOKEN
    # -- If true, opencost will use kube-rbac-proxy to authenticate with in cluster Prometheus for openshift
    kubeRBACProxy: false
    # -- Whether to disable SSL certificate verification
    insecureSkipVerify: false
    external:
      # -- Use external Prometheus (eg. Grafana Cloud)
      enabled: false
      # -- External Prometheus url
      url: "https://prometheus.example.com/prometheus"
    internal:
      # -- Use in-cluster Prometheus
      enabled: true
      # -- Service name of in-cluster Prometheus
      serviceName: prometheus-server
      # -- Namespace of in-cluster Prometheus
      namespaceName: prometheus-system
      # -- Service port of in-cluster Prometheus
      port: 80
      # -- Path to access the Prometheus API, this is necessary if the Prometheus server is behind a reverse proxy(mimir) or has a different path.
      path: ""
      # -- Scheme to use for in-cluster Prometheus
      scheme: http
    amp:
      # -- Use Amazon Managed Service for Prometheus (AMP)
      enabled: false  # If true, opencost will be configured to remote_write and query from Amazon Managed Service for Prometheus.
      # -- Workspace ID for AMP
      workspaceId: ""
    thanos:
      enabled: false
      queryOffset: ''
      maxSourceResolution: ''
      internal:
        enabled: true
        serviceName: my-thanos-query
        namespaceName: opencost
        port: 10901
        scheme: http
      external:
        enabled: false
        url: 'https://thanos-query.example.com/thanos'
  ui:
    # -- Enable OpenCost UI
    enabled: true
    image:
      # -- UI container image registry
      registry: 10.118.17.28:30002
      # -- UI container image name
      repository: opencost/opencost-ui
      # -- UI container image tag
      # @default -- `""` (use appVersion in Chart.yaml)
      tag: "1.118.0"
      # -- UI container image pull policy
      pullPolicy: IfNotPresent
      # -- Override the full image name for development purposes
      fullImageName: null
    resources:
      # -- CPU/Memory resource requests
      requests:
        cpu: '10m'
        memory: '55Mi'
      # -- CPU/Memory resource limits
      limits:
        memory: '1Gi'
... ... ... ...
extraVolumes: []
4.4 GPU测试用例
# Namespace that holds the GPU test workload.
apiVersion: v1
kind: Namespace
metadata:
  name: gpu-test
---
# Single-replica Deployment that requests one `mars-tech.com/gpu` and then
# just sleeps, so the GPU stays allocated for the duration of the test.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-sleeper
  namespace: gpu-test
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gpu-sleeper
  template:
    metadata:
      labels:
        app: gpu-sleeper
    spec:
      # Pin the pod to one specific node that is labelled as having a GPU.
      nodeSelector:
        kubernetes.io/hostname: k8s-worker-26
        mars-tech.com/gpu.installed: "true"
      containers:
        - name: gpu-sleeper
          image: 10.118.17.28:30002/linux/busybox:1.36
          # Idle for 360000 seconds; the container does no GPU work itself.
          command: ["sh", "-c", "sleep 360000"]
          resources:
            requests:
              mars-tech.com/gpu: "1"
            limits:
              mars-tech.com/gpu: "1"
