公司的开发测试环境想部署个 docker 集群,k8s 不会,k3s 更不会。
目前搭配的组合是 esxi 作为基础系统,虚拟出多个 centos8 的系统,在 centos8 上安装 docker swarm 集群,出现了一个问题!! docker swarm 部署服务完成后,跨主机的容器之间都能正常 ping 通,但是从宿主机去访问 docker 开放的端口时,访问三次只有一次成功。具体如下:
公司路由器网关 10.0.0.1 1. server-01 10.0.0.21 (manage) 2. server-02 10.0.0.22 3. server-03 10.0.0.23 ### 防火墙全部关闭 只有 iptable server-01 $ docker swarm init --default-addr-pool 192.0.0.0/24 server-02 $ docker swarm join server-03 $ docker swarm join ### server-01 $ docker node ls ID HOSTNAME STATUS AVAILABILITY MANAGER STATUS ENGINE VERSION km7dmxn402qt0s473kpqb47ac * Server-01 Ready Active Leader 20.10.9 k5vq74oh1njscvv4mf9gpyogh Server-02 Ready Active 20.10.9 rxzmo276saehmh1rc118fdxxe Server-03 Ready Active 20.10.9 ### 网络状态如下 server-01 $ docker network inspect ingress [ { "Name": "ingress", "Id": "m7ia7lmmlu1zm0zchr13ohk4q", "Created": "2021-10-14T15:08:48.036907446+08:00", "Scope": "swarm", "Driver": "overlay", "EnableIPv6": false, "IPAM": { "Driver": "default", "Options": null, "Config": [ { "Subnet": "192.0.0.0/24", "Gateway": "192.0.0.1" } ] }, "Internal": false, "Attachable": false, "Ingress": true, "ConfigFrom": { "Network": "" }, "ConfigOnly": false, "Containers": { "ingress-sbox": { "Name": "ingress-endpoint", "EndpointID": "4b5146ca8e180dd88a5271b7d29b439f6d5995801a47d8c648379d9b51ab0b77", "MacAddress": "02:42:c0:00:00:02", "IPv4Address": "192.0.0.2/24", "IPv6Address": "" } }, "Options": { "com.docker.network.driver.overlay.vxlanid_list": "4096" }, "Labels": {}, "Peers": [ { "Name": "6ebb8868ac00", "IP": "10.0.0.21" }, { "Name": "7982d5a14bf2", "IP": "10.0.0.22" }, { "Name": "b25e17d118a4", "IP": "10.0.0.23" } ] } ] server-01 $ docker network inspect docker_gwbridge [ { "Name": "docker_gwbridge", "Id": "6f2d03207e884bfec1918d4e8fc1a1f5f14ec9e5bcd71fd409a26630ab73d413", "Created": "2021-10-14T15:08:48.422229208+08:00", "Scope": "local", "Driver": "bridge", "EnableIPv6": false, "IPAM": { "Driver": "default", "Options": null, "Config": [ { "Subnet": "172.18.0.0/16", "Gateway": "172.18.0.1" } ] }, "Internal": false, "Attachable": false, "Ingress": false, "ConfigFrom": { "Network": "" }, "ConfigOnly": false, "Containers": { "ingress-sbox": { "Name": 
"gateway_ingress-sbox", "EndpointID": "1c4c1b5ba462d87832710029171c3911df457c950055a369670f59cef374247b", "MacAddress": "02:42:ac:12:00:02", "IPv4Address": "172.18.0.2/16", "IPv6Address": "" } }, "Options": { "com.docker.network.bridge.enable_icc": "false", "com.docker.network.bridge.enable_ip_masquerade": "true", "com.docker.network.bridge.name": "docker_gwbridge" }, "Labels": {} } ] ### 创建 nginx 服务 server-01 $ docker service create --replicas 3 -p 80:80 --name nginx nginx server-01 $ docker service ps nginx ID NAME IMAGE NODE DESIRED STATE CURRENT STATE ERROR PORTS xsomsqqtkr62 nginx.1 nginx:latest Server-02 Running Running 2 minutes ago selbdoapjek0 nginx.2 nginx:latest Server-03 Running Running 2 minutes ago w5bigfn8xtz4 nginx.3 nginx:latest Server-01 Running Running 2 minutes ago server-01 $ docker service ls ID NAME MODE REPLICAS IMAGE PORTS ro33x7v9ceri nginx replicated 3/3 nginx:latest *:80->80/tcp server-01 $ docker ps -a CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 37de9b7759c9 nginx:latest "/docker-entrypoint.…" 5 minutes ago Up 5 minutes 80/tcp nginx.3.w5bigfn8xtz4pi10hoe62gi4b ## 重点来了!!! [root@Server-01 ~]# curl 10.0.0.21 --卡住 ^C [root@Server-01 ~]# curl 10.0.0.21 --卡住 ^C [root@Server-01 ~]# curl 10.0.0.21 --三次成功一次 <!DOCTYPE html> <html> <head> <title>Welcome to nginx!</title> <style> html { color-scheme: light dark; } body { width: 35em; margin: 0 auto; font-family: Tahoma, Verdana, Arial, sans-serif; } </style> </head> <body> <h1>Welcome to nginx!</h1> <p>If you see this page, the nginx web server is successfully installed and working. 
Further configuration is required.</p> <p>For online documentation and support please refer to <a href="http://nginx.org/">nginx.org</a>.<br/> Commercial support is available at <a href="http://nginx.com/">nginx.com</a>.</p> <p><em>Thank you for using nginx.</em></p> </body> </html> [root@Server-01 ~]# netstat -tunlp Active Internet connections (only servers) Proto Recv-Q Send-Q Local Address Foreign Address State PID/Program name tcp 0 0 192.168.122.1:53 0.0.0.0:* LISTEN 1740/dnsmasq tcp 0 0 0.0.0.0:22 0.0.0.0:* LISTEN 1068/sshd tcp 0 0 0.0.0.0:111 0.0.0.0:* LISTEN 1/systemd tcp6 0 0 :::22 :::* LISTEN 1068/sshd tcp6 0 0 :::2377 :::* LISTEN 1222/dockerd tcp6 0 0 :::7946 :::* LISTEN 1222/dockerd tcp6 0 0 :::111 :::* LISTEN 1/systemd tcp6 0 0 :::80 :::* LISTEN 1222/dockerd udp 0 0 192.168.122.1:53 0.0.0.0:* 1740/dnsmasq udp 0 0 0.0.0.0:67 0.0.0.0:* 1740/dnsmasq udp 0 0 0.0.0.0:111 0.0.0.0:* 1/systemd udp 0 0 0.0.0.0:4789 0.0.0.0:* - udp6 0 0 :::7946 :::* 1222/dockerd udp6 0 0 :::111 :::* 1/systemd [root@Server-01 ~]# iptables -nL --line-number Chain INPUT (policy ACCEPT) num target prot opt source destination 1 LIBVIRT_INP all -- 0.0.0.0/0 0.0.0.0/0 Chain FORWARD (policy DROP) num target prot opt source destination 1 DOCKER-USER all -- 0.0.0.0/0 0.0.0.0/0 2 DOCKER-INGRESS all -- 0.0.0.0/0 0.0.0.0/0 3 DOCKER-ISOLATION-STAGE-1 all -- 0.0.0.0/0 0.0.0.0/0 4 ACCEPT all -- 0.0.0.0/0 0.0.0.0/0 ctstate RELATED,ESTABLISHED 5 DOCKER all -- 0.0.0.0/0 0.0.0.0/0 6 ACCEPT all -- 0.0.0.0/0 0.0.0.0/0 7 ACCEPT all -- 0.0.0.0/0 0.0.0.0/0 ctstate RELATED,ESTABLISHED 8 DOCKER all -- 0.0.0.0/0 0.0.0.0/0 9 ACCEPT all -- 0.0.0.0/0 0.0.0.0/0 10 ACCEPT all -- 0.0.0.0/0 0.0.0.0/0 11 LIBVIRT_FWX all -- 0.0.0.0/0 0.0.0.0/0 12 LIBVIRT_FWI all -- 0.0.0.0/0 0.0.0.0/0 13 LIBVIRT_FWO all -- 0.0.0.0/0 0.0.0.0/0 14 DROP all -- 0.0.0.0/0 0.0.0.0/0 Chain OUTPUT (policy ACCEPT) num target prot opt source destination 1 LIBVIRT_OUT all -- 0.0.0.0/0 0.0.0.0/0 Chain LIBVIRT_INP (1 references) num target prot 
opt source destination 1 ACCEPT udp -- 0.0.0.0/0 0.0.0.0/0 udp dpt:53 2 ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp dpt:53 3 ACCEPT udp -- 0.0.0.0/0 0.0.0.0/0 udp dpt:67 4 ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp dpt:67 Chain LIBVIRT_OUT (1 references) num target prot opt source destination 1 ACCEPT udp -- 0.0.0.0/0 0.0.0.0/0 udp dpt:53 2 ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp dpt:53 3 ACCEPT udp -- 0.0.0.0/0 0.0.0.0/0 udp dpt:68 4 ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp dpt:68 Chain LIBVIRT_FWO (1 references) num target prot opt source destination 1 ACCEPT all -- 192.168.122.0/24 0.0.0.0/0 2 REJECT all -- 0.0.0.0/0 0.0.0.0/0 reject-with icmp-port-unreachable Chain LIBVIRT_FWI (1 references) num target prot opt source destination 1 ACCEPT all -- 0.0.0.0/0 192.168.122.0/24 ctstate RELATED,ESTABLISHED 2 REJECT all -- 0.0.0.0/0 0.0.0.0/0 reject-with icmp-port-unreachable Chain LIBVIRT_FWX (1 references) num target prot opt source destination 1 ACCEPT all -- 0.0.0.0/0 0.0.0.0/0 Chain DOCKER (2 references) num target prot opt source destination Chain DOCKER-ISOLATION-STAGE-1 (1 references) num target prot opt source destination 1 DOCKER-ISOLATION-STAGE-2 all -- 0.0.0.0/0 0.0.0.0/0 2 DOCKER-ISOLATION-STAGE-2 all -- 0.0.0.0/0 0.0.0.0/0 3 RETURN all -- 0.0.0.0/0 0.0.0.0/0 Chain DOCKER-ISOLATION-STAGE-2 (2 references) num target prot opt source destination 1 DROP all -- 0.0.0.0/0 0.0.0.0/0 2 DROP all -- 0.0.0.0/0 0.0.0.0/0 3 RETURN all -- 0.0.0.0/0 0.0.0.0/0 Chain DOCKER-USER (1 references) num target prot opt source destination 1 RETURN all -- 0.0.0.0/0 0.0.0.0/0 Chain DOCKER-INGRESS (1 references) num target prot opt source destination 1 ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 tcp dpt:80 2 ACCEPT tcp -- 0.0.0.0/0 0.0.0.0/0 state RELATED,ESTABLISHED tcp spt:80 3 RETURN all -- 0.0.0.0/0 0.0.0.0/0
1 saytesnake 2021-10-14 17:36:45 +08:00 在 esxi 网卡打开允许混合。 |
![]() | 2 defunct9 2021-10-14 19:59:36 +08:00 哦。推倒重来。你用的 swarm 过时了。直接用 docker-compose |
![]() | 3 1BF6oSYCD9ngBHo1 2021-10-14 22:36:52 +08:00 首先对楼主的测试方法有点好奇的是:你已经在 01 节点测试服务可用性的话,为什么不 curl localhost/127.0.0.1 ?你 curl 了 10 段的话,其中的测试结果应该是包含两个东西:节点服务可用性+节点本机 10 段网络的配置。 然后,作为两年前实践过用 swarm 来尝试搭过小集群的过来人( 3 台机子,每台约 10 个 service,每个 service 从 5~20 个 replicas 不等),只想说,这货就是个全的社区项目,这并不是说 swarm 不能用,只是想要达到企业级的稳定性 /安全性 /灵活性是不可能的。为什么?其实了解下 swarm 这项目出来的目的,便会发现这东西是很难达到“好用”级别的(但是不可否认还是挺好玩)。而要想做到前面说的这些,唯有 kubernetes,国内也有一些服务商有提供 out-of-box 的云原生基建平台,都非常不错。但是如果你要手把手地用 swarm 来搞,那只能祝君好运,并且玩得愉快。 |
![]() | 5 liuxu 2021-10-14 23:14:01 +08:00 你要是 debian/ubuntu 的话我可以帮你详细分析下,其他的系统我就只能大致说下怎么查。 首先你的 server-01 的 ip 似乎有一个 192.168.122.0/24,先确认下 server-0{1,2,3} 和你本地机器的 ip 是不是在一个网段,互相 ping 一下; 然后 server-0{1,2,3} 的 iptables、netstat 和 ifconfig 都看看; 最后互相 curl,在双方机器上用 tcpdump 抓包看看。 |
![]() | 6 ik 2021-10-14 23:18:54 +08:00 via iPhone iptables 规则问题? 三个 docker 服务都重启一下呢? |
![]() | 7 ziwen1943 2021-10-15 08:57:32 +08:00 看看防火墙和 iptables 是不是有奇奇怪怪的规则 |
8 zxkxhnqwe123 OP @vinle 三台服务器上面都是一样的 调用 curl 127.0.0.1 都是一样的效果. 并且所有系统都是干净重装好的 |
9 zxkxhnqwe123 OP @saytesnake 试过了 好像也不行 ! 叫混杂模式 |
10 zxkxhnqwe123 OP @saytesnake 主要是 我是开发人员,公司也没有专业运维,现在想解决 devops 自动化运维 测试环境,所以只能从简单的折腾 |
![]() | 11 juzisang 2021-10-15 09:30:24 +08:00 看一下这几个端口有没有开放 https://docs.docker.com/engine/swarm/swarm-tutorial/#open-protocols-and-ports-between-the-hosts 前几个月也搭了一个 swarm 集群 t/772731 |
![]() | 12 byzf 2021-10-15 10:53:59 +08:00 以前碰到过几次请求三次只成功一次的情况,有 dns 配置的问题,有负载均衡的问题。 |
![]() | 13 defunct9 2021-10-15 10:55:29 +08:00 开 ssh,让我上去看看 |
17 mepwang 2021-10-15 16:16:22 +08:00 curl -v 看看卡到哪一步了 |
18 jackleeforce3615 2021-10-15 16:53:29 +08:00 一直以为没多少人用 docker swarm 了 |
![]() | 19 mkdir 2021-10-15 17:14:21 +08:00 @jackleeforce3615 一直用一直爽 |
20 zxkxhnqwe123 OP @mepwang [root@Server-01 ~]# curl 127.0.0.1 -v * Rebuilt URL to: 127.0.0.1/ * Trying 127.0.0.1... * TCP_NODELAY set ^C [root@Server-01 ~]# curl 127.0.0.1 -v * Rebuilt URL to: 127.0.0.1/ * Trying 127.0.0.1... * TCP_NODELAY set * Connected to 127.0.0.1 (127.0.0.1) port 80 (#0) > GET / HTTP/1.1 > Host: 127.0.0.1 > User-Agent: curl/7.61.1 > Accept: */* > < HTTP/1.1 200 OK < Server: nginx/1.21.3 < Date: Fri, 15 Oct 2021 09:56:24 GMT < Content-Type: text/html < Content-Length: 615 < Last-Modified: Tue, 07 Sep 2021 15:21:03 GMT < Connection: keep-alive < ETag: "6137835f-267" < Accept-Ranges: bytes < <!DOCTYPE html> <html> <head> <title>Welcome to nginx!</title> <style> html { color-scheme: light dark; } body { width: 35em; margin: 0 auto; font-family: Tahoma, Verdana, Arial, sans-serif; } </style> </head> <body> <h1>Welcome to nginx!</h1> <p>If you see this page, the nginx web server is successfully installed and working. Further configuration is required.</p> <p>For online documentation and support please refer to <a href="http://nginx.org/">nginx.org</a>.<br/> Commercial support is available at <a href="http://nginx.com/">nginx.com</a>.</p> <p><em>Thank you for using nginx.</em></p> </body> </html> * Connection #0 to host 127.0.0.1 left intact [root@Server-01 ~]# ^C [root@Server-01 ~]# curl 127.0.0.1 -v * Rebuilt URL to: 127.0.0.1/ * Trying 127.0.0.1... * TCP_NODELAY set |
21 mepwang 2021-10-18 11:25:35 +08:00 看不出来什么问题,curl 调用三次成功一次,会不会和你的副本数量有关系? 能给的建议不多, 你把 replica 的数目改成 4 个或 2 个,看看 curl 调用成功的几率是不是变成 4 次或者 2 次成功一次。 感觉是你的 swarm 集群有点问题,直觉上是网络转发这块。 你给你的应用添加一个 overlay network 试试看? |
22 zxkxhnqwe123 OP 终于解决了 !!!! 放假花了两天时间解决了,也当学习了 . 这两天重装了 不下 20 次 ,3 台虚拟机 不停重启,重装. 原理就是开启 esxi 网卡的混杂模式 , 网卡用 E1000e (这个其实不太确定,不想验证了). 然后确认下 swarm 网关和局域网的网关是否冲突了. 这些做完就是圆满结束 感谢以上的朋友帮忙!!! 判断依据 https://stackoverflow.com/questions/59007780/container-running-on-docker-swarm-not-accessible-from-outside |
23 isnullstring 2024-08-12 10:15:23 +08:00 @zxkxhnqwe123 #22 回来留个脚印。我的情况跟楼主一样,先是确认 swarm 网关,默认是 10.0.0.0 ,跟现有网段一致的话肯定是不行的。 环境:esxi 6.7 + ubuntu 22.04 + 10.0.0.0 完整解决办法: 1、虚拟交换机开混杂模式 2、必须修改虚拟机网卡类型为 E1000 3、初始化集群时指定 IP 段,注意 stackoverflow 中的回答 -------------------------------------------------swarm 网段---------------------通讯 IP docker swarm init --default-addr-pool 11.0.0.0/8 --advertise-addr 10.0.1.137 |
24 isnullstring 2024-08-12 10:19:14 +08:00 @isnullstring #23 还有个奇怪现象,只有 1 个管理节点和 1 个工作节点时候就没毛病,但是通过管理节点无法访问工作节点的端口,第二个节点一加进来就凉 |