问题描述
Kubernetes 节点重启后出现 Pod 启动失败的情况,状态如下:
[root@base-k8s-master-1 ~]# kubectl get pod
NAME READY STATUS RESTARTS AGE
init-demo 0/1 Unknown 1 58d
multus-demo 0/1 Unknown 374 15d
virt-launcher-my-vm-mcwrf 0/2 UnexpectedAdmissionError 0 12d
virt-launcher-vm-alpine-dmgfp 0/3 UnexpectedAdmissionError 0 13d
virt-launcher-vm-alpine1-9fgxp 0/3 UnexpectedAdmissionError 0 4d16h
virt-launcher-vm-bridge-gdlf8 0/3 UnexpectedAdmissionError 0 27d
virt-launcher-vm-dv-2rcg5 0/2 UnexpectedAdmissionError 0 41h
virt-launcher-vm-dv-7v2jf 0/2 UnexpectedAdmissionError 0 40h
[root@base-k8s-master-1 ~]# kubectl get pod -n kube-system -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
calico-kube-controllers-5794944fb7-ngbdq 0/1 ContainerCreating 0 37m <none> base-k8s-master-1.example.com <none> <none>
calico-node-2rtlh 1/1 Running 2 (25m ago) 37m 192.168.50.132 base-k8s-worker-1.example.com <none> <none>
calico-node-st8rq 1/1 Running 1 (25m ago) 38m 192.168.50.133 base-k8s-worker-2.example.com <none> <none>
calico-node-wqtvd 1/1 Running 1 (24m ago) 37m 192.168.50.131 base-k8s-master-1.example.com <none> <none>
coredns-76f65d49f6-bp58d 0/1 Terminating 0 2m50s <none> base-k8s-master-1.example.com <none> <none>
coredns-76f65d49f6-q2knz 0/1 ContainerCreating 0 37m <none> base-k8s-worker-2.example.com <none> <none>
coredns-7c664775df-mhvp5 0/1 ContainerCreating 0 27s <none> base-k8s-master-1.example.com <none> <none>
coredns-7c664775df-nhgl8 0/1 ContainerCreating 0 27s <none> base-k8s-worker-1.example.com <none> <none>
dynamic-networks-controller-ds-7dwwc 0/1 Unknown 0 26d <none> base-k8s-worker-1.example.com <none> <none>
dynamic-networks-controller-ds-8njzv 0/1 CrashLoopBackOff 10 (4m33s ago) 26d 10.100.171.44 base-k8s-worker-2.example.com <none> <none>
dynamic-networks-controller-ds-8xpc4 0/1 Error 7 70m <none> base-k8s-master-1.example.com <none> <none>
etcd-base-k8s-master-1.example.com 1/1 Running 4 (24m ago) 263d 192.168.50.131 base-k8s-master-1.example.com <none> <none>
kube-apiserver-base-k8s-master-1.example.com 1/1 Running 3 (24m ago) 58d 192.168.50.131 base-k8s-master-1.example.com <none> <none>
kube-controller-manager-base-k8s-master-1.example.com 1/1 Running 3 (24m ago) 57d 192.168.50.131 base-k8s-master-1.example.com <none> <none>
kube-multus-ds-kqtcl 1/1 Running 2 (25m ago) 40m 192.168.50.132 base-k8s-worker-1.example.com <none> <none>
kube-multus-ds-wlm5n 1/1 Running 1 (25m ago) 38m 192.168.50.133 base-k8s-worker-2.example.com <none> <none>
kube-multus-ds-zn2ml 1/1 Running 1 (24m ago) 40m 192.168.50.131 base-k8s-master-1.example.com <none> <none>
kube-proxy-7v7ld 1/1 Running 1 (24m ago) 81m 192.168.50.131 base-k8s-master-1.example.com <none> <none>
kube-proxy-blrln 1/1 Running 2 (25m ago) 81m 192.168.50.132 base-k8s-worker-1.example.com <none> <none>
kube-proxy-pzvh4 1/1 Running 1 (25m ago) 81m 192.168.50.133 base-k8s-worker-2.example.com <none> <none>
kube-scheduler-base-k8s-master-1.example.com 1/1 Running 3 (24m ago) 57d 192.168.50.131 base-k8s-master-1.example.com <none> <none>
snapshot-controller-668549b974-62zjw 0/1 Unknown 0 14d <none> base-k8s-worker-1.example.com <none> <none>
snapshot-controller-668549b974-czdjg 0/1 Unknown 0 14d <none> base-k8s-worker-2.example.com <none> <none>
解决办法
这里只找到了解决办法,但是原因还搞不明白,已知的是节点重启后,所有需要通过 Calico 添加网卡的 Pod 创建都有问题,但是使用 Host Network 的 Pod 可以正常创建。
查看 events:
[root@base-k8s-master-1 ~]# kubectl get events -n kube-system
16m Warning FailedCreatePodSandBox pod/snapshot-controller-668549b974-czdjg Failed to create pod sandbox: rpc error: code = Unknown desc = failed to setup network for sandbox "a61d040e1fc531d345b2d6ecf1d5ef674c50f8b5bd7e06b3a8a74c2d4618b8fa": plugin type="multus-shim" name="multus-cni-network" failed (add): CmdAdd (shim): timed out waiting for the condition
15m Warning FailedCreatePodSandBox pod/snapshot-controller-668549b974-czdjg Failed to create pod sandbox: rpc error: code = Unknown desc = failed to setup network for sandbox "0cda0084ddfff6ed19888988cd2f23eb68c636577a9d845cf6abc8d4d9ee2533": plugin type="multus-shim" name="multus-cni-network" failed (add): CmdAdd (shim): timed out waiting for the condition
14m Warning FailedCreatePodSandBox pod/snapshot-controller-668549b974-czdjg Failed to create pod sandbox: rpc error: code = Unknown desc = failed to setup network for sandbox "bf4ee6ae0b008e4f4e46c928afb01153300632596d5ff3533548bd55ff5ade64": plugin type="multus-shim" name="multus-cni-network" failed (add): CmdAdd (shim): timed out waiting for the condition
13m Warning FailedCreatePodSandBox pod/snapshot-controller-668549b974-czdjg Failed to create pod sandbox: rpc error: code = Unknown desc = failed to setup network for sandbox "5175aa6e57461dad56735bceaa757f387ebeb041e989b786066c0df74158e82d": plugin type="multus-shim" name="multus-cni-network" failed (add): CmdAdd (shim): timed out waiting for the condition
11m Warning FailedCreatePodSandBox pod/snapshot-controller-668549b974-czdjg Failed to create pod sandbox: rpc error: code = Unknown desc = failed to setup network for sandbox "6faf0c66475ee02dc094da21fe9733ce14554926778e58b515452cbaac8675fc": plugin type="multus-shim" name="multus-cni-network" failed (add): CmdAdd (shim): timed out waiting for the condition
10m Warning FailedCreatePodSandBox pod/snapshot-controller-668549b974-czdjg Failed to create pod sandbox: rpc error: code = Unknown desc = failed to setup network for sandbox "1c2dfbe48d316b29482de0427addcdc243635afb6092407c343576f580fe1520": plugin type="multus-shim" name="multus-cni-network" failed (add): CmdAdd (shim): timed out waiting for the condition
9m31s Warning FailedCreatePodSandBox pod/snapshot-controller-668549b974-czdjg Failed to create pod sandbox: rpc error: code = Unknown desc = failed to setup network for sandbox "98e5486d88a69fdd19b7aca09b636b3710adceac135e58e14148def2bb249925": plugin type="multus-shim" name="multus-cni-network" failed (add): CmdAdd (shim): timed out waiting for the condition
3m21s Warning FailedCreatePodSandBox pod/snapshot-controller-668549b974-czdjg (combined from similar events): Failed to create pod sandbox: rpc error: code = Unknown desc = failed to setup network for sandbox "f33500517abd5b14e9942463614713bedf7f2a2c1ab5145e292f5731dcc7fde5": plugin type="multus-shim" name="multus-cni-network" failed (add): CmdAdd (shim): timed out waiting for the condition
可以看到 multus 字样,问题可能和 Multus-cni 有关,检查 Multus-cni 的 Pod:
[root@base-k8s-master-1 ~]# kubectl get pod -n kube-system
NAME READY STATUS RESTARTS AGE
...output omitted...
kube-multus-ds-d85qk 0/1 Init:CrashLoopBackOff 14 (2m19s ago) 28m
kube-multus-ds-t5vsc 0/1 Init:CrashLoopBackOff 14 (2m50s ago) 29m
kube-multus-ds-wmzrq 0/1 Init:CrashLoopBackOff 10 (3m2s ago) 29m
...output omitted...
发现 Pod 状态异常,查看 Pod 信息:
[root@base-k8s-master-1 ~]# kubectl describe pod -n kube-system kube-multus-ds-t5vsc
...output omitted...
Status: Pending
IP: 192.168.50.132
IPs:
IP: 192.168.50.132
Controlled By: DaemonSet/kube-multus-ds
Init Containers:
install-multus-binary:
Container ID: containerd://2aca0a64689f1f8e05ce81620e8b7541bb61d8f475707036651df084ed3b085b
Image: ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot-thick
Image ID: ghcr.io/k8snetworkplumbingwg/multus-cni@sha256:dfbd2361580d72e0cad10f3dd9f2f37a23c6317a363f5cca3eaaadf5ba7f43b1
Port: <none>
Host Port: <none>
Command:
cp
/usr/src/multus-cni/bin/multus-shim
/host/opt/cni/bin/multus-shim
State: Waiting
Reason: CrashLoopBackOff
Last State: Terminated
Reason: Error
Message: cp: cannot create regular file '/host/opt/cni/bin/multus-shim': Text file busy
...output omitted...
可以看到 initContainers 有报错,内容为 cp: cannot create regular file '/host/opt/cni/bin/multus-shim': Text file busy。
搜索报错信息找到类似的问题:
https://github.com/k8snetworkplumbingwg/multus-cni/issues/1221
https://github.com/k8snetworkplumbingwg/multus-cni/pull/1213
第二个链接有对应的解决办法:
kubectl patch daemonset kube-multus-ds -n kube-system --type='json' -p='[
{
"op": "replace",
"path": "/spec/template/spec/initContainers/0/command",
"value": [
"/usr/src/multus-cni/bin/install_multus",
"-d",
"/host/opt/cni/bin",
"-t",
"thick"
]
}
]'
执行后重新检查 Multus-cni 的 Pod 状态:
[root@base-k8s-master-1 ~]# kubectl get pod -n kube-system
...output omitted...
kube-multus-ds-5z9z8 1/1 Running 0 17s
kube-multus-ds-lgmrn 1/1 Running 0 17s
kube-multus-ds-wzwp7 1/1 Running 0 17s
...output omitted...
到这我以为好了,但是发现 Pod 还是有问题,找了半天,发现上边第二个链接里还有人提到了需要清理一下 Multus-cni 的配置文件:
kubectl delete ds kube-multus-ds -n kube-system
# delete on every node
rm -rf /etc/cni/net.d/00-multus.conf
[root@base-k8s-master-1 ~]# kubectl get pod
NAME READY STATUS RESTARTS AGE
init-demo 0/1 Unknown 2 58d
multus-demo 0/1 Unknown 0 81m
virt-launcher-vm-alpine-tpjvf 0/3 UnexpectedAdmissionError 0 154m
virt-launcher-vm-alpine1-lr4j6 0/3 UnexpectedAdmissionError 0 154m
virt-launcher-vm-bridge-gpqw6 0/3 UnexpectedAdmissionError 0 154m
virt-launcher-vm-dv-8ncfj 0/2 UnexpectedAdmissionError 0 154m
需要删除 kube-multus-ds 这个 DaemonSet,并在所有节点删除 /etc/cni/net.d/00-multus.conf:
[root@base-k8s-master-1 multus-cni]# kubectl delete daemonsets.apps -n kube-system kube-multus-ds
daemonset.apps "kube-multus-ds" deleted
[root@base-k8s-master-1 multus-cni]# rm -rf /etc/cni/net.d/00-multus.conf
[root@base-k8s-master-1 multus-cni]# ssh base-k8s-worker-1 rm -rf /etc/cni/net.d/00-multus.conf
[root@base-k8s-master-1 multus-cni]# ssh base-k8s-worker-2 rm -rf /etc/cni/net.d/00-multus.conf
等一会,可以看到大部分 Pod 恢复正常:
[root@base-k8s-master-1 multus-cni]# kubectl get pod
NAME READY STATUS RESTARTS AGE
init-demo 1/1 Running 3 (25m ago) 58d
multus-demo 1/1 Running 1 (25m ago) 86m
virt-launcher-my-vm-v9h92 1/2 Error 0 2m9s
virt-launcher-vm-alpine-9k4lx 3/3 Running 0 48s
virt-launcher-vm-alpine1-7tjhb 3/3 Running 0 48s
virt-launcher-vm-bridge-xszbd 3/3 Running 0 48s
virt-launcher-vm-dv-925sh 2/2 Running 0 48s
其中 virt-launcher-my-vm-v9h92 状态为 Error,是因为这是个虚拟机 Pod,使用了 Multus-cni 来添加第二块网卡,而此时 kube-multus-ds 已被删除:
[root@base-k8s-master-1 multus-cni]# kubectl get vm my-vm -o jsonpath='{.spec.template.spec.networks}' | jq
[
{
"name": "default",
"pod": {}
},
{
"multus": {
"networkName": "linux-bridge-br-1"
},
"name": "net1"
}
]
[root@base-k8s-master-1 multus-cni]# kubectl get network-attachment-definitions.k8s.cni.cncf.io
NAME AGE
linux-bridge-br-1 8m42s
重新创建 kube-multus-ds:
[root@base-k8s-master-1 multus-cni]# kubectl apply -f multus-daemonset-thick.yml
customresourcedefinition.apiextensions.k8s.io/network-attachment-definitions.k8s.cni.cncf.io unchanged
clusterrole.rbac.authorization.k8s.io/multus unchanged
clusterrolebinding.rbac.authorization.k8s.io/multus unchanged
serviceaccount/multus unchanged
configmap/multus-daemon-config unchanged
daemonset.apps/kube-multus-ds created
multus-daemonset-thick.yml 是在官网下载的。
重启虚拟机:
[root@base-k8s-master-1 ~]# virtctl restart my-vm
VM my-vm was scheduled to restart
再次检查:
[root@base-k8s-master-1 multus-cni]# kubectl get pod
NAME READY STATUS RESTARTS AGE
init-demo 1/1 Running 3 (32m ago) 58d
multus-demo 1/1 Running 1 (32m ago) 93m
virt-launcher-my-vm-s7p87 2/2 Running 0 5m34s
virt-launcher-vm-alpine-9k4lx 3/3 Running 0 7m56s
virt-launcher-vm-alpine1-7tjhb 3/3 Running 0 7m56s
virt-launcher-vm-bridge-xszbd 3/3 Running 0 7m56s
virt-launcher-vm-dv-925sh 2/2 Running 0 7m56s
最后再贴一下 events,今天没找到原因,找时间再看下失败的原因:
70m Warning FailedCreatePodSandBox pod/multus-demo Failed to create pod sandbox: rpc error: code = Unknown desc = failed to setup network for sandbox "6b14ec48888547aaf7b0fc3643293be1f0841259c668c6400251c364b200abcf": plugin type="multus-shim" name="multus-cni-network" failed (add): CmdAdd (shim): failed to send CNI request: Post "http://dummy/cni": EOF: StdinData: {"capabilities":{"bandwidth":true,"portMappings":true},"clusterNetwork":"/host/etc/cni/net.d/10-calico.conflist","cniVersion":"0.3.1","logLevel":"verbose","logToStderr":true,"name":"multus-cni-network","type":"multus-shim"}
65m Warning FailedCreatePodSandBox pod/multus-demo Failed to create pod sandbox: rpc error: code = Unknown desc = failed to setup network for sandbox "327096a10ef9e0bcde5281c01ae7d5dd2f90ec8287a7f370df9836993888db31": plugin type="multus-shim" name="multus-cni-network" failed (add): CmdAdd (shim): CNI request failed with status 400: 'ContainerID:"327096a10ef9e0bcde5281c01ae7d5dd2f90ec8287a7f370df9836993888db31" Netns:"/var/run/netns/cni-828b4b68-1ce1-e14f-5e8b-40fcb15cfafe" IfName:"eth0" Args:"K8S_POD_NAMESPACE=default;K8S_POD_NAME=multus-demo;K8S_POD_INFRA_CONTAINER_ID=327096a10ef9e0bcde5281c01ae7d5dd2f90ec8287a7f370df9836993888db31;K8S_POD_UID=b208f76f-d916-4e46-ac2f-d2fc9b59cde2;IgnoreUnknown=1" Path:"" ERRORED: error configuring pod [default/multus-demo] networking: [default/multus-demo/b208f76f-d916-4e46-ac2f-d2fc9b59cde2:k8s-pod-network]: error adding container to network "k8s-pod-network": plugin type="calico" failed (add): netplugin failed with no error message: signal: killed...