categraf版本v0.3.37,n9e-edge版本v6.2.0
如下为categraf配置
# categraf agent configuration, as pasted into this report.
[global]
print_configs = false
# Same value as global.labels.region below; the sample series in the log
# excerpt of this report carries agent_hostname="sh16" and region="sh16".
hostname = "sh16"
# NOTE(review): unit is not stated in this file; presumably seconds — confirm.
interval = 15
providers = ["local"]
concurrency = -1
[global.labels]
region = "sh16"
[log]
# The log file is written inside the conf directory.
file_name = "/opt/categraf/conf/categraf.log"
# NOTE(review): units for max_size / max_age are not stated here
# (presumably MB and days) — confirm.
max_size = 100
max_age = 1
max_backups = 1
local_time = true
compress = false
[writer_opt]
batch = 1000
chan_size = 1000000
[[writers]]
# Remote-write target on loopback, port 19000 — the same port as [HTTP].Port
# in the n9e-edge config of this report.
# NOTE(review): the log excerpt in this report shows dials to
# 10.150.249.24:19000, not 127.0.0.1:19000 — confirm the running categraf
# process actually loaded this file.
url = "http://127.0.0.1:19000/prometheus/v1/write"
basic_auth_user = ""
basic_auth_pass = ""
# NOTE(review): presumably milliseconds — confirm. The logged failures are
# "dial tcp ... i/o timeout", i.e. they occur while connecting.
timeout = 5000
dial_timeout = 5000
max_idle_conns_per_host = 100
[http]
# categraf's own HTTP listener is disabled in this setup.
enable = false
address = ":9100"
print_access = false
run_mode = "release"
[ibex]
# Disabled in this setup.
enable = false
interval = "1000ms"
servers = ["0.0.0.0:20090"]
meta_dir = "./meta"
[heartbeat]
enable = true
# Same host and port as the [[writers]] url above.
# NOTE(review): the heartbeat error in the log excerpt also targets
# 10.150.249.24:19000 rather than 127.0.0.1 — confirm which config is live.
url = "http://127.0.0.1:19000/v1/n9e/edge/heartbeat"
# NOTE(review): unit not stated here; presumably seconds — confirm.
interval = 10
basic_auth_user = ""
basic_auth_pass = ""
# NOTE(review): presumably milliseconds — confirm.
timeout = 5000
dial_timeout = 5000
max_idle_conns_per_host = 100
[prometheus]
# Disabled in this setup.
enable = false
# NOTE(review): looks like a placeholder path; presumably unused while
# enable = false — confirm.
scrape_config_file = "/path/to/in_cluster_scrape.yaml"
log_level = "info"
如下为n9e-edge配置,敏感信息使用xxx代替
# n9e-edge configuration, as pasted into this report. Sensitive values were
# replaced with "xxx" by the reporter.
[Global]
RunMode = "release"
[CenterApi]
# Host is redacted ("xxx"); port is 17000.
Addrs = ["http://xxx:17000"]
# Same user name as the key under [HTTP.APIForService.BasicAuth] below.
BasicAuthUser = "user001"
BasicAuthPass = "xxx"
# NOTE(review): unit not stated here; presumably milliseconds — confirm.
Timeout = 9000
[Log]
# NOTE(review): presumably Dir only matters when Output is a file — confirm.
Dir = "logs"
Level = "DEBUG"
Output = "stdout"
[HTTP]
# Listens on all interfaces, port 19000 — the port the categraf [[writers]]
# and [heartbeat] URLs in this report point at.
Host = "0.0.0.0"
Port = 19000
# Empty cert/key paths.
CertFile = ""
KeyFile = ""
PrintAccessLog = false
PProf = false
ExposeMetrics = true
# NOTE(review): units for the timeouts below are not stated here; presumably
# seconds — confirm.
ShutdownTimeout = 30
# 67108864 = 64 * 1024 * 1024 (presumably bytes — confirm).
MaxContentLength = 67108864
ReadTimeout = 20
WriteTimeout = 40
IdleTimeout = 120
[HTTP.APIForAgent]
Enable = true
[HTTP.APIForService]
Enable = true
[HTTP.APIForService.BasicAuth]
# Key matches CenterApi.BasicAuthUser; the value is redacted.
user001 = "xxx"
# [Alert] carries no keys of its own; only the Heartbeat sub-table is set.
[Alert]
[Alert.Heartbeat]
IP = ""
# NOTE(review): unit not stated here; presumably milliseconds — confirm.
Interval = 1000
EngineName = "edge"
[Pushgw]
LabelRewrite = true
[[Pushgw.Writers]]
# Remote-write target on loopback, port 9090.
Url = "http://127.0.0.1:9090/api/v1/write"
BasicAuthUser = ""
BasicAuthPass = ""
# NOTE(review): presumably a flat list of alternating header name / value
# (X-From: n9e) — confirm.
Headers = ["X-From", "n9e"]
# NOTE(review): units for the timeouts below are not stated here; presumably
# milliseconds — confirm.
Timeout = 10000
DialTimeout = 3000
TLSHandshakeTimeout = 30000
ExpectContinueTimeout = 1000
IdleConnTimeout = 90000
KeepAlive = 30000
# NOTE(review): 0 presumably means "no limit" — confirm.
MaxConnsPerHost = 0
MaxIdleConns = 100
MaxIdleConnsPerHost = 100
n9e 做了下沉部署,边缘机房的 categraf 与 n9e-edge 部署在同一台设备上,但 categraf 日志里经常出现连接 n9e-edge 超时(dial tcp … i/o timeout)的报错,这是什么原因?categraf 日志如下:
2023/11/15 10:02:51 writer.go:96: W! push data with remote write request got error: Post "http://10.150.249.24:19000/prometheus/v1/write": dial tcp 10.150.249.24:19000: i/o timeout response body:
2023/11/15 10:02:51 writer.go:66: W! post to http://10.150.249.24:19000/prometheus/v1/write got error: Post "http://10.150.249.24:19000/prometheus/v1/write": dial tcp 10.150.249.24:19000: i/o timeout
2023/11/15 10:02:51 writer.go:67: W! example timeseries: labels:<name:"name" value:"xfusion_snmp_systemPowerState" > labels:<name:"model" value:"2288H V6" > labels:<name:"dnsname" value:"2106194WAVXHN9000021.m.sh16" > labels:<name:"region" value:"sh16" > labels:<name:"agent_hostname" value:"sh16" > labels:<name:"source" value:"2106194WAVXHN9000021" > samples:<value:2 timestamp:1700013754889 >
2023/11/15 10:02:52 heartbeat.go:148: E! failed to do heartbeat: Post "http://10.150.249.24:19000/v1/n9e/edge/heartbeat": dial tcp 10.150.249.24:19000: i/o timeout
2023/11/15 10:02:53 writer.go:96: W! push data with remote write request got error: Post "http://10.150.249.24:19000/prometheus/v1/write": dial tcp 10.150.249.24:19000: i/o timeout response body:
2023/11/15 10:02:53 writer.go:66: W! post to http://10.150.249.24:19000/prometheus/v1/write got error: Post "http://10.150.249.24:19000/prometheus/v1/write": dial tcp 10.150.249.24:19000: i/o timeout