小小的运维
小小的运维
发布于 2026-03-22 / 15 阅读
0
0

Go 日常巡检CLI工具分享

gowatch

分享一个很久之前学习go做的一个日常巡检工具,如果你也在学习go,作为入门的一个学习案例,还是挺不错的。

源码

package main
​
import (
  "context"
  "encoding/json"
  "flag"
  "fmt"
  "io"
  "log"
  "math"
  "os"
  "os/signal"
  "path/filepath"
  "sort"
  "strings"
  "sync"
  "syscall"
  "time"
  "unicode/utf8"

  "github.com/shirou/gopsutil/v3/cpu"
  "github.com/shirou/gopsutil/v3/disk"
  "github.com/shirou/gopsutil/v3/host"
  "github.com/shirou/gopsutil/v3/load"
  "github.com/shirou/gopsutil/v3/mem"
  gnet "github.com/shirou/gopsutil/v3/net"
  "github.com/shirou/gopsutil/v3/process"
)
​
// Process exit codes, so external scripts can tell apart a clean run,
// an alerting run, a hard failure, and a lock conflict.
const (
  ExitOK    = 0  // finished without any alert
  ExitAlert = 10 // at least one threshold alert fired during the run
  ExitFail  = 20 // collection, configuration, or initialization failure
  ExitLock  = 30 // file lock not acquired (another instance is running)
)
​
// Config mirrors the JSON configuration file that drives a run.
// CLI flags may override Interval, Format, and PromTextfile (see main).
type Config struct {
  JobName       string                   `json:"jobname"`
  Interval      int                      `json:"interval"` // sampling interval, seconds
  Format        string                   `json:"format"`   // "json" or "text"
  LogFile       string                   `json:"log_file"`
  LogLevel      string                   `json:"log_level"` // "DEBUG" enables debug logging
  LockFile      string                   `json:"lock_file"`
  Mountpoints   []string                 `json:"mountpoints"`
  NetIfaces     []string                 `json:"net_ifaces"` // empty means all interfaces
  TopN          int                      `json:"topn"`
  IgnoreProcess []string                 `json:"ignore_process"`
  Thresholds    map[string]ThresholdRule `json:"thresholds"`
  PromTextfile  string                   `json:"prom_textfile"`
}
​
// ThresholdRule is the raw threshold definition as written in the config
// file; nil pointer fields mean "not set".
type ThresholdRule struct {
  GT       *float64 `json:"gt"`         // trigger when value > GT
  LT       *float64 `json:"lt"`         // trigger when value < LT
  Times    int      `json:"times"`      // consecutive hits required; defaulted to 1 at parse time
  Mount    string   `json:"mount"`      // disk_percent only: mountpoint to check
  GTPerCPU *float64 `json:"gt_per_cpu"` // load1 only: per-core load limit
}
​
// Threshold is the runtime form of a rule with defaults applied
// (Times is always >= 1 after parseThresholds).
type Threshold struct {
  Name     string
  GT       *float64
  LT       *float64
  Times    int
  Mount    string
  GTPerCPU *float64
}
​
// Alert is one fired alert: the rule that triggered, the observed value,
// the consecutive hit count, and the rule definition for context.
type Alert struct {
  Type   string         `json:"type"`
  Mount  string         `json:"mount,omitempty"`
  Value  float64        `json:"value"`
  Times  int            `json:"times"`
  Reason string         `json:"reason"`
  Rule   map[string]any `json:"rule"`
}
​
// CPUInfo aggregates CPU core count, total utilization, and per-core utilization.
type CPUInfo struct {
  Count   int       `json:"count"`
  Percent float64   `json:"percent"`
  PerCPU  []float64 `json:"percpu_percent,omitempty"`
}
​
// LoadInfo holds the 1/5/15-minute system load averages.
type LoadInfo struct {
  Load1  float64 `json:"load1"`
  Load5  float64 `json:"load5"`
  Load15 float64 `json:"load15"`
}
​
// MemInfo aggregates RAM and swap usage.
type MemInfo struct {
  Total       uint64  `json:"total"`
  Available   uint64  `json:"available"`
  Used        uint64  `json:"used"`
  Percent     float64 `json:"percent"`
  SwapTotal   uint64  `json:"swap_total"`
  SwapUsed    uint64  `json:"swap_used"`
  SwapPercent float64 `json:"swap_percent"`
}
​
// DiskUsage records space usage for one mountpoint. When the probe failed,
// Error carries the message and the numeric fields are left zero.
type DiskUsage struct {
  Total   uint64  `json:"total,omitempty"`
  Used    uint64  `json:"used,omitempty"`
  Free    uint64  `json:"free,omitempty"`
  Percent float64 `json:"percent,omitempty"`
  Error   string  `json:"error,omitempty"`
}
​
// DiskIO holds cumulative disk IO counters summed over all devices.
type DiskIO struct {
  ReadBytes  uint64 `json:"read_bytes"`
  WriteBytes uint64 `json:"write_bytes"`
  ReadCount  uint64 `json:"read_count"`
  WriteCount uint64 `json:"write_count"`
}
​
// DiskInfo bundles per-mount usage with the global IO counters.
type DiskInfo struct {
  Usage map[string]DiskUsage `json:"usage"`
  IO    DiskIO               `json:"io"`
}
​
// NetCounter holds the cumulative sent/received byte counts of one NIC.
type NetCounter struct {
  BytesSent uint64 `json:"bytes_sent"`
  BytesRecv uint64 `json:"bytes_recv"`
}
​
// NetRate is the per-NIC instantaneous rate derived between two samples;
// Dt is the sampling gap in seconds used for the division.
type NetRate struct {
  RxBps float64 `json:"rx_Bps"`
  TxBps float64 `json:"tx_Bps"`
  Dt    float64 `json:"dt"`
}
​
// NetInfo bundles cumulative NIC counters and the derived rates.
type NetInfo struct {
  Counters map[string]NetCounter `json:"counters"`
  Rate     map[string]NetRate    `json:"rate"`
}
​
// Metrics is the complete result of one sample.
type Metrics struct {
  TS        string   `json:"ts"` // RFC 3339 sample timestamp
  Host      string   `json:"host"`
  CPU       CPUInfo  `json:"cpu"`
  Load      LoadInfo `json:"load"`
  Mem       MemInfo  `json:"mem"`
  Disk      DiskInfo `json:"disk"`
  Net       NetInfo  `json:"net"`
  UptimeSec uint64   `json:"uptime_sec"`
  BootTime  string   `json:"boot_time"`
}
​
// ProcessInfo describes one process in the top-N resource listings.
type ProcessInfo struct {
  PID        int32   `json:"pid"`
  Name       string  `json:"name"`
  User       string  `json:"user"`
  CPUPercent float64 `json:"cpu_percent"`
  RSS        uint64  `json:"rss"`
  Cmdline    string  `json:"cmdline"` // truncated to at most 500 bytes
}
​
// NetSnapshot remembers the previous NIC counters and when they were taken,
// so the next sample can derive per-second rates.
type NetSnapshot struct {
  CapturedAt    time.Time
  CountersByNIC map[string]NetCounter
}
​
// Collector reads system metrics via gopsutil. It keeps the previous
// network snapshot so per-NIC rates can be computed between samples.
type Collector struct {
  mountpoints         []string
  netIfaces           []string
  previousNetSnapshot *NetSnapshot // nil until the first sample completes
}
​
// AlertEngine evaluates thresholds against each sample, tracking the
// consecutive hit count per rule key.
type AlertEngine struct {
  thresholds []Threshold
  hitCounts  map[string]int // rule key -> consecutive hits; reset on miss
}
​
// RotateWriter is an io.Writer that rotates the target file once a write
// would push it past maxBytes, keeping numbered backups.
type RotateWriter struct {
  mu          sync.Mutex // serializes Write/rotate against concurrent loggers
  path        string
  maxBytes    int64 // non-positive disables rotation
  backupCount int
  currentFile *os.File
}
​
// AppLogger wraps a standard logger with either "json" or text formatting.
type AppLogger struct {
  logger *log.Logger
  format string // "json" selects JSON records; anything else yields text lines
  debug  bool   // when false, DEBUG-level records are dropped
}
​
// FileLock guarantees a single running instance per job via flock(2).
type FileLock struct {
  path     string
  lockFile *os.File // non-nil only while the lock is held
}
​
// isoNow returns the current wall-clock time formatted as RFC 3339.
func isoNow() string {
  now := time.Now()
  return now.Format(time.RFC3339)
}
​
// maxInt reports the larger of two ints.
func maxInt(left, right int) int {
  result := right
  if left > right {
    result = left
  }
  return result
}
​
func readJSONFile(path string, target any) error {
  fileContent, err := os.ReadFile(path)
  if err != nil {
    return err
  }
  return json.Unmarshal(fileContent, target)
}
​
func ensureDir(path string) error {
  if path == "" || path == "." {
    return nil
  }
  return os.MkdirAll(path, 0o755)
}
​
// atomicWriteText 先写入临时文件再替换目标文件,避免读到半截内容。
func atomicWriteText(path, content string) error {
  dirPath := filepath.Dir(path)
  if err := ensureDir(dirPath); err != nil {
    return err
  }
  tempFile, err := os.CreateTemp(dirPath, ".tmp_*")
  if err != nil {
    return err
  }
  tempFilePath := tempFile.Name()
  defer os.Remove(tempFilePath)
​
  if _, err := tempFile.WriteString(content); err != nil {
    tempFile.Close()
    return err
  }
  if err := tempFile.Sync(); err != nil {
    tempFile.Close()
    return err
  }
  if err := tempFile.Close(); err != nil {
    return err
  }
  return os.Rename(tempFilePath, path)
}
​
// newRotateWriter constructs a RotateWriter and opens its target file.
func newRotateWriter(path string, maxBytes int64, backupCount int) (*RotateWriter, error) {
  w := &RotateWriter{
    path:        path,
    maxBytes:    maxBytes,
    backupCount: backupCount,
  }
  if err := w.open(); err != nil {
    return nil, err
  }
  return w, nil
}
​
// open ensures the log directory exists and opens the log file for appending.
func (writer *RotateWriter) open() error {
  if err := ensureDir(filepath.Dir(writer.path)); err != nil {
    return err
  }
  handle, err := os.OpenFile(writer.path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644)
  if err == nil {
    writer.currentFile = handle
  }
  return err
}
​
// rotate closes the active file, shifts existing numbered backups up by one
// (highest index first so nothing gets clobbered early), moves the live file
// to ".1", and reopens a fresh live file.
func (writer *RotateWriter) rotate() error {
  if writer.currentFile != nil {
    _ = writer.currentFile.Close()
    writer.currentFile = nil
  }
  for idx := writer.backupCount - 1; idx >= 1; idx-- {
    src := fmt.Sprintf("%s.%d", writer.path, idx)
    dst := fmt.Sprintf("%s.%d", writer.path, idx+1)
    if _, statErr := os.Stat(src); statErr == nil {
      _ = os.Rename(src, dst)
    }
  }
  if _, statErr := os.Stat(writer.path); statErr == nil {
    _ = os.Rename(writer.path, writer.path+".1")
  }
  return writer.open()
}
​
func (writer *RotateWriter) Write(data []byte) (int, error) {
  writer.mu.Lock()
  defer writer.mu.Unlock()
​
  if writer.currentFile == nil {
    if err := writer.open(); err != nil {
      return 0, err
    }
  }
  fileInfo, err := writer.currentFile.Stat()
  if err == nil && fileInfo.Size()+int64(len(data)) > writer.maxBytes && writer.maxBytes > 0 {
    if err := writer.rotate(); err != nil {
      return 0, err
    }
  }
  return writer.currentFile.Write(data)
}
​
// newLogger builds an AppLogger that always writes to stdout and, when
// logFile is set, additionally to a rotating file (10 MiB cap, 7 backups).
func newLogger(logFile, format string, debug bool) (*AppLogger, error) {
  sinks := []io.Writer{os.Stdout}
  if logFile != "" {
    fileSink, err := newRotateWriter(logFile, 10*1024*1024, 7)
    if err != nil {
      return nil, err
    }
    sinks = append(sinks, fileSink)
  }
  return &AppLogger{
    logger: log.New(io.MultiWriter(sinks...), "", 0),
    format: format,
    debug:  debug,
  }, nil
}
​
func (appLogger *AppLogger) log(level, msg string, fields map[string]any) {
  if level == "DEBUG" && !appLogger.debug {
    return
  }
  if fields == nil {
    fields = map[string]any{}
  }
  if appLogger.format == "json" {
    logFields := map[string]any{
      "ts":    isoNow(),
      "level": level,
      "msg":   msg,
    }
    for fieldName, fieldValue := range fields {
      logFields[fieldName] = fieldValue
    }
    jsonPayload, _ := json.Marshal(logFields)
    appLogger.logger.Println(string(jsonPayload))
    return
  }
​
  var fieldParts []string
  for fieldName, fieldValue := range fields {
    fieldParts = append(fieldParts, fmt.Sprintf("%s=%v", fieldName, fieldValue))
  }
  sort.Strings(fieldParts)
  line := fmt.Sprintf("%s %-5s %s", isoNow(), level, msg)
  if len(fieldParts) > 0 {
    line += " " + strings.Join(fieldParts, " ")
  }
  appLogger.logger.Println(line)
}
​
// Info logs a message at INFO level.
func (appLogger *AppLogger) Info(msg string, fields map[string]any) {
  appLogger.log("INFO", msg, fields)
}
// Warn logs a message at WARN level.
func (appLogger *AppLogger) Warn(msg string, fields map[string]any) {
  appLogger.log("WARN", msg, fields)
}
// Error logs a message at ERROR level.
func (appLogger *AppLogger) Error(msg string, fields map[string]any) {
  appLogger.log("ERROR", msg, fields)
}
// Debug logs a message at DEBUG level; suppressed unless debug mode is on.
func (appLogger *AppLogger) Debug(msg string, fields map[string]any) {
  appLogger.log("DEBUG", msg, fields)
}
​
// Acquire opens (creating if needed) the lock file and takes a non-blocking
// exclusive flock on it, so at most one instance per job can run. On success
// the current PID is written into the file on a best-effort basis.
func (lock *FileLock) Acquire() error {
  if err := ensureDir(filepath.Dir(lock.path)); err != nil {
    return err
  }
  handle, err := os.OpenFile(lock.path, os.O_CREATE|os.O_RDWR, 0o644)
  if err != nil {
    return err
  }
  if flockErr := syscall.Flock(int(handle.Fd()), syscall.LOCK_EX|syscall.LOCK_NB); flockErr != nil {
    _ = handle.Close()
    return flockErr
  }
  // Best effort: record our PID so operators can see who holds the lock.
  if truncErr := handle.Truncate(0); truncErr == nil {
    _, _ = handle.WriteString(fmt.Sprintf("%d\n", os.Getpid()))
  }
  lock.lockFile = handle
  return nil
}
​
// Release drops the flock and closes the lock file; calling it without a
// held lock is a no-op.
func (lock *FileLock) Release() {
  handle := lock.lockFile
  if handle == nil {
    return
  }
  lock.lockFile = nil
  _ = syscall.Flock(int(handle.Fd()), syscall.LOCK_UN)
  _ = handle.Close()
}
​
// parseThresholds 把配置中的阈值规则整理成运行时结构,并补齐默认触发次数。
func parseThresholds(config Config) []Threshold {
  var parsedThresholds []Threshold
  for name, rule := range config.Thresholds {
    requiredHits := rule.Times
    if requiredHits <= 0 {
      requiredHits = 1
    }
    parsedThresholds = append(parsedThresholds, Threshold{
      Name:     name,
      GT:       rule.GT,
      LT:       rule.LT,
      Times:    requiredHits,
      Mount:    rule.Mount,
      GTPerCPU: rule.GTPerCPU,
    })
  }
  return parsedThresholds
}
​
// Check reports whether value violates this threshold and, if so, returns a
// human-readable reason. For a load1 rule with gt_per_cpu set, the limit
// scales with the CPU count (treated as at least 1).
func (threshold Threshold) Check(value float64, cpuCount int) (bool, string) {
  if threshold.Name == "load1" && threshold.GTPerCPU != nil {
    cores := cpuCount
    if cores < 1 {
      cores = 1
    }
    limit := *threshold.GTPerCPU * float64(cores)
    return value > limit, fmt.Sprintf("load1 %.2f > %.2f (gt_per_cpu=%.2f)", value, limit, *threshold.GTPerCPU)
  }
  switch {
  case threshold.GT != nil && value > *threshold.GT:
    return true, fmt.Sprintf("%s %.2f > %.2f", threshold.Name, value, *threshold.GT)
  case threshold.LT != nil && value < *threshold.LT:
    return true, fmt.Sprintf("%s %.2f < %.2f", threshold.Name, value, *threshold.LT)
  }
  return false, ""
}
​
// newAlertEngine builds an AlertEngine with zeroed consecutive-hit counters.
func newAlertEngine(thresholds []Threshold) *AlertEngine {
  return &AlertEngine{thresholds: thresholds, hitCounts: map[string]int{}}
}
​
// Evaluate checks every configured threshold against the current sample,
// updates each rule's consecutive-hit counter, and returns whether any alert
// fired plus the list of fired alerts. A rule alerts only after being hit on
// `times` consecutive samples; a miss — or an unavailable metric — resets
// its counter to zero.
func (engine *AlertEngine) Evaluate(metrics Metrics) (bool, []Alert) {
  var alerts []Alert
  hasAlert := false

  // getMetricValue maps a rule name to the sampled value it guards; the
  // boolean is false when the metric is missing or errored for this sample.
  getMetricValue := func(threshold Threshold) (float64, bool) {
    switch threshold.Name {
    case "cpu_percent":
      return metrics.CPU.Percent, true
    case "mem_percent":
      return metrics.Mem.Percent, true
    case "load1":
      return metrics.Load.Load1, true
    case "disk_percent":
      mountpoint := threshold.Mount
      if mountpoint == "" {
        mountpoint = "/"
      }
      diskUsage, ok := metrics.Disk.Usage[mountpoint]
      if !ok || diskUsage.Error != "" {
        return 0, false
      }
      return diskUsage.Percent, true
    default:
      // Unknown rule names never trigger.
      return 0, false
    }
  }

  for _, threshold := range engine.thresholds {
    currentValue, ok := getMetricValue(threshold)
    // The counter key includes the mount so disk rules on different
    // mountpoints never share state.
    alertKey := threshold.Name
    if threshold.Mount != "" {
      alertKey += ":" + threshold.Mount
    }
    if !ok {
      engine.hitCounts[alertKey] = 0
      continue
    }
    hit, reason := threshold.Check(currentValue, metrics.CPU.Count)
    if hit {
      engine.hitCounts[alertKey]++
    } else {
      engine.hitCounts[alertKey] = 0
    }
    if hit && engine.hitCounts[alertKey] >= maxInt(threshold.Times, 1) {
      hasAlert = true
      alerts = append(alerts, Alert{
        Type:   threshold.Name,
        Mount:  threshold.Mount,
        Value:  currentValue,
        Times:  engine.hitCounts[alertKey],
        Reason: reason,
        Rule: map[string]any{
          "gt":         threshold.GT,
          "lt":         threshold.LT,
          "times":      threshold.Times,
          "gt_per_cpu": threshold.GTPerCPU,
        },
      })
    }
  }
  return hasAlert, alerts
}
​
// warmUpCPUAndProcesses performs throwaway CPU and per-process percent
// reads. gopsutil percentages are deltas between calls, so without this
// priming pass the first real sample would be badly skewed.
func warmUpCPUAndProcesses() {
  _, _ = cpu.Percent(0, false)
  _, _ = cpu.Percent(0, true)
  if procs, err := process.Processes(); err == nil {
    for _, proc := range procs {
      _, _ = proc.Percent(0)
    }
  }
}
​
// newCollector builds a Collector; with no mountpoints configured it falls
// back to watching only the root filesystem.
func newCollector(mountpoints, netIfaces []string) *Collector {
  targets := mountpoints
  if len(targets) == 0 {
    targets = []string{"/"}
  }
  return &Collector{mountpoints: targets, netIfaces: netIfaces}
}
​
// Collect performs one full sample: host identity and uptime, CPU, load,
// memory/swap, per-mount disk usage, aggregate disk IO, and per-NIC network
// counters plus rates derived from the previous snapshot. Load, swap, disk
// IO, and CPU-count failures are tolerated (fields left zero); the other
// probes abort the sample with an error.
func (collector *Collector) Collect() (Metrics, error) {
  var metrics Metrics
  metrics.TS = isoNow()

  hostInfo, err := host.Info()
  if err != nil {
    return metrics, err
  }
  metrics.Host = hostInfo.Hostname
  metrics.UptimeSec = hostInfo.Uptime
  metrics.BootTime = time.Unix(int64(hostInfo.BootTime), 0).Format(time.RFC3339)

  // Interval 0 means "since the previous cpu.Percent call"; the warm-up
  // pass at startup makes this first delta meaningful.
  totalCPUPercentages, err := cpu.Percent(0, false)
  if err != nil {
    return metrics, err
  }
  perCPUPercentages, err := cpu.Percent(0, true)
  if err != nil {
    return metrics, err
  }
  cpuCount, _ := cpu.Counts(true)
  totalCPUPercent := 0.0
  if len(totalCPUPercentages) > 0 {
    totalCPUPercent = totalCPUPercentages[0]
  }
  metrics.CPU = CPUInfo{Count: maxInt(cpuCount, 1), Percent: totalCPUPercent, PerCPU: perCPUPercentages}

  loadAverage, err := load.Avg()
  if err == nil {
    metrics.Load = LoadInfo{Load1: loadAverage.Load1, Load5: loadAverage.Load5, Load15: loadAverage.Load15}
  }

  virtualMemoryStat, err := mem.VirtualMemory()
  if err != nil {
    return metrics, err
  }
  swapMemoryStat, _ := mem.SwapMemory()
  metrics.Mem = MemInfo{
    Total:       virtualMemoryStat.Total,
    Available:   virtualMemoryStat.Available,
    Used:        virtualMemoryStat.Used,
    Percent:     virtualMemoryStat.UsedPercent,
    SwapTotal:   swapMemoryStat.Total,
    SwapUsed:    swapMemoryStat.Used,
    SwapPercent: swapMemoryStat.UsedPercent,
  }

  // Per-mount failures are recorded in the result rather than aborting,
  // so one bad mount cannot hide the rest of the sample.
  diskUsageByMount := map[string]DiskUsage{}
  for _, mountpoint := range collector.mountpoints {
    diskUsageStat, err := disk.Usage(mountpoint)
    if err != nil {
      diskUsageByMount[mountpoint] = DiskUsage{Error: err.Error()}
      continue
    }
    diskUsageByMount[mountpoint] = DiskUsage{
      Total:   diskUsageStat.Total,
      Used:    diskUsageStat.Used,
      Free:    diskUsageStat.Free,
      Percent: diskUsageStat.UsedPercent,
    }
  }

  diskIOCounters, _ := disk.IOCounters()
  diskIO := DiskIO{}
  for _, diskCounter := range diskIOCounters {
    diskIO.ReadBytes += diskCounter.ReadBytes
    diskIO.WriteBytes += diskCounter.WriteBytes
    diskIO.ReadCount += diskCounter.ReadCount
    diskIO.WriteCount += diskCounter.WriteCount
  }
  metrics.Disk = DiskInfo{Usage: diskUsageByMount, IO: diskIO}

  nicCounters, err := gnet.IOCounters(true)
  if err != nil {
    return metrics, err
  }
  // An empty whitelist means "collect every interface".
  selectedInterfaces := map[string]bool{}
  if len(collector.netIfaces) > 0 {
    for _, iface := range collector.netIfaces {
      selectedInterfaces[iface] = true
    }
  }

  currentNetCounters := map[string]NetCounter{}
  for _, nicCounter := range nicCounters {
    if len(selectedInterfaces) > 0 && !selectedInterfaces[nicCounter.Name] {
      continue
    }
    currentNetCounters[nicCounter.Name] = NetCounter{
      BytesSent: nicCounter.BytesSent,
      BytesRecv: nicCounter.BytesRecv,
    }
  }
  netRates := map[string]NetRate{}
  now := time.Now()
  if collector.previousNetSnapshot != nil {
    elapsedSeconds := now.Sub(collector.previousNetSnapshot.CapturedAt).Seconds()
    // Clamp the interval to avoid dividing by (near) zero.
    if elapsedSeconds < 1e-6 {
      elapsedSeconds = 1e-6
    }
    // NIC rate = delta of cumulative counters divided by the sampling gap.
    // NOTE(review): counter wrap/reset would underflow the uint64 subtraction
    // and produce a huge rate — confirm whether that matters in practice.
    for iface, currentCounter := range currentNetCounters {
      previousCounter, ok := collector.previousNetSnapshot.CountersByNIC[iface]
      if !ok {
        continue
      }
      rxBps := float64(currentCounter.BytesRecv-previousCounter.BytesRecv) / elapsedSeconds
      txBps := float64(currentCounter.BytesSent-previousCounter.BytesSent) / elapsedSeconds
      netRates[iface] = NetRate{RxBps: rxBps, TxBps: txBps, Dt: elapsedSeconds}
    }
  }
  collector.previousNetSnapshot = &NetSnapshot{CapturedAt: now, CountersByNIC: currentNetCounters}
  metrics.Net = NetInfo{Counters: currentNetCounters, Rate: netRates}

  return metrics, nil
}
​
// topNProcesses returns the processes with the highest CPU percent and the
// highest RSS as two independently sorted lists under the keys "cpu" and
// "mem". Processes whose name cannot be read or appears in ignoreNames are
// skipped; topN <= 0 yields empty lists.
func topNProcesses(topN int, ignoreNames []string) map[string][]ProcessInfo {
  ignoredProcessNames := map[string]bool{}
  for _, processName := range ignoreNames {
    ignoredProcessNames[processName] = true
  }

  processList, err := process.Processes()
  if err != nil {
    // Degrade to empty listings rather than failing the alert path.
    return map[string][]ProcessInfo{"cpu": {}, "mem": {}}
  }

  processInfos := make([]ProcessInfo, 0, len(processList))
  for _, proc := range processList {
    processName, err := proc.Name()
    if err != nil || ignoredProcessNames[processName] {
      continue
    }
    username, _ := proc.Username()
    commandLine, _ := proc.Cmdline()
    memoryInfo, err := proc.MemoryInfo()
    if err != nil {
      continue
    }
    // NOTE(review): if MemoryInfo can return (nil, nil) on some platform,
    // memoryInfo.RSS below would panic — confirm against gopsutil docs.
    cpuPercent, _ := proc.Percent(0)
    processInfos = append(processInfos, ProcessInfo{
      PID:        proc.Pid,
      Name:       processName,
      User:       username,
      CPUPercent: cpuPercent,
      RSS:        memoryInfo.RSS,
      Cmdline:    truncate(commandLine, 500),
    })
  }

  // Sort two independent copies so the CPU ordering does not disturb the
  // memory ordering.
  processesByCPU := append([]ProcessInfo(nil), processInfos...)
  sort.Slice(processesByCPU, func(i, j int) bool { return processesByCPU[i].CPUPercent > processesByCPU[j].CPUPercent })
  processesByMemory := append([]ProcessInfo(nil), processInfos...)
  sort.Slice(processesByMemory, func(i, j int) bool { return processesByMemory[i].RSS > processesByMemory[j].RSS })

  if topN < 0 {
    topN = 0
  }
  if len(processesByCPU) > topN {
    processesByCPU = processesByCPU[:topN]
  }
  if len(processesByMemory) > topN {
    processesByMemory = processesByMemory[:topN]
  }
  return map[string][]ProcessInfo{"cpu": processesByCPU, "mem": processesByMemory}
}
​
// truncate shortens value to at most maxLength bytes without splitting a
// multi-byte UTF-8 rune: when the byte cut lands mid-rune, it backs up to
// the previous rune boundary so the result stays valid UTF-8 (process
// command lines may contain non-ASCII text). A negative maxLength is
// treated as 0 (the original would have panicked slicing with it).
func truncate(value string, maxLength int) string {
  if maxLength < 0 {
    maxLength = 0
  }
  if len(value) <= maxLength {
    return value
  }
  cut := maxLength
  // Walk back over UTF-8 continuation bytes to the start of a rune.
  for cut > 0 && !utf8.RuneStart(value[cut]) {
    cut--
  }
  return value[:cut]
}
​
// promEscape escapes backslashes, newlines, and double quotes so the value
// is legal inside a Prometheus text-format label.
func promEscape(value string) string {
  escaper := strings.NewReplacer(
    `\`, `\\`,
    "\n", `\n`,
    `"`, `\"`,
  )
  return escaper.Replace(value)
}
​
// buildPromText renders the sample and alert state in the Prometheus
// textfile exposition format. Map keys are sorted before emission so the
// output is byte-stable across runs.
func buildPromText(metrics Metrics, alerts []Alert) string {
  var promBuilder strings.Builder
  fmt.Fprintf(&promBuilder, "# HELP gowatch_cpu_percent CPU usage percent\n")
  fmt.Fprintf(&promBuilder, "# TYPE gowatch_cpu_percent gauge\n")
  fmt.Fprintf(&promBuilder, "gowatch_cpu_percent %.3f\n", metrics.CPU.Percent)

  fmt.Fprintf(&promBuilder, "# HELP gowatch_mem_percent Memory usage percent\n")
  fmt.Fprintf(&promBuilder, "# TYPE gowatch_mem_percent gauge\n")
  fmt.Fprintf(&promBuilder, "gowatch_mem_percent %.3f\n", metrics.Mem.Percent)

  fmt.Fprintf(&promBuilder, "# HELP gowatch_load1 System load1\n")
  fmt.Fprintf(&promBuilder, "# TYPE gowatch_load1 gauge\n")
  fmt.Fprintf(&promBuilder, "gowatch_load1 %.3f\n", metrics.Load.Load1)

  fmt.Fprintf(&promBuilder, "# HELP gowatch_disk_percent Disk usage percent by mount\n")
  fmt.Fprintf(&promBuilder, "# TYPE gowatch_disk_percent gauge\n")
  mountpoints := make([]string, 0, len(metrics.Disk.Usage))
  for mountpoint := range metrics.Disk.Usage {
    mountpoints = append(mountpoints, mountpoint)
  }
  sort.Strings(mountpoints)
  for _, mountpoint := range mountpoints {
    diskUsage := metrics.Disk.Usage[mountpoint]
    // Mounts that failed to probe are omitted rather than exported as zero.
    if diskUsage.Error != "" {
      continue
    }
    fmt.Fprintf(&promBuilder, "gowatch_disk_percent{mount=\"%s\"} %.3f\n", promEscape(mountpoint), diskUsage.Percent)
  }

  fmt.Fprintf(&promBuilder, "# HELP gowatch_net_rx_bytes_per_sec Network RX rate bytes/s\n")
  fmt.Fprintf(&promBuilder, "# TYPE gowatch_net_rx_bytes_per_sec gauge\n")
  fmt.Fprintf(&promBuilder, "# HELP gowatch_net_tx_bytes_per_sec Network TX rate bytes/s\n")
  fmt.Fprintf(&promBuilder, "# TYPE gowatch_net_tx_bytes_per_sec gauge\n")
  interfaceNames := make([]string, 0, len(metrics.Net.Rate))
  for interfaceName := range metrics.Net.Rate {
    interfaceNames = append(interfaceNames, interfaceName)
  }
  sort.Strings(interfaceNames)
  for _, interfaceName := range interfaceNames {
    netRate := metrics.Net.Rate[interfaceName]
    fmt.Fprintf(&promBuilder, "gowatch_net_rx_bytes_per_sec{iface=\"%s\"} %.3f\n", promEscape(interfaceName), netRate.RxBps)
    fmt.Fprintf(&promBuilder, "gowatch_net_tx_bytes_per_sec{iface=\"%s\"} %.3f\n", promEscape(interfaceName), netRate.TxBps)
  }

  // Every known alert type is always exported (0 or 1) so dashboards see
  // an explicit "not firing" instead of a missing series.
  activeAlerts := map[string]bool{}
  for _, alert := range alerts {
    activeAlerts[alert.Type] = true
  }
  knownAlertTypes := []string{"cpu_percent", "mem_percent", "disk_percent", "load1"}
  fmt.Fprintf(&promBuilder, "# HELP gowatch_alert_active Alert active (0/1) by type\n")
  fmt.Fprintf(&promBuilder, "# TYPE gowatch_alert_active gauge\n")
  for _, alertType := range knownAlertTypes {
    activeValue := 0
    if activeAlerts[alertType] {
      activeValue = 1
    }
    fmt.Fprintf(&promBuilder, "gowatch_alert_active{type=\"%s\"} %d\n", promEscape(alertType), activeValue)
  }
  return promBuilder.String()
}
​
// sampleSummary renders a compact, human-readable one-line digest of a
// sample; the root-disk figure shows "n/a" when "/" was not probed cleanly.
func sampleSummary(jobName string, metrics Metrics, hasAlert bool) string {
  rootUsage := "n/a"
  if usage, ok := metrics.Disk.Usage["/"]; ok && usage.Error == "" {
    rootUsage = fmt.Sprintf("%.1f", usage.Percent)
  }
  return fmt.Sprintf("job=%s cpu=%.1f%% mem=%.1f%% load1=%.2f disk/=%s alert=%v",
    jobName, metrics.CPU.Percent, metrics.Mem.Percent, metrics.Load.Load1, rootUsage, hasAlert)
}
​
// configValueInt resolves an int setting with precedence: CLI flag (when
// non-negative, since -1 marks "unset"), then config value (when positive),
// then the built-in default.
func configValueInt(flagValue, configValue, defaultValue int) int {
  switch {
  case flagValue >= 0:
    return flagValue
  case configValue > 0:
    return configValue
  default:
    return defaultValue
  }
}
​
// configValueString resolves a string setting with precedence: CLI flag,
// then config value, then the built-in default (empty string means "unset").
func configValueString(flagValue, configValue, defaultValue string) string {
  switch {
  case flagValue != "":
    return flagValue
  case configValue != "":
    return configValue
  default:
    return defaultValue
  }
}
​
// run is the main loop: take a sample, evaluate alerts, log, optionally
// export a Prometheus textfile, then either exit (--once, --duration,
// signal) or sleep out the rest of the interval. The return value is the
// process exit code (ExitOK / ExitAlert / ExitFail / ExitLock).
func run(ctx context.Context, config Config, once bool, intervalSec, durationSec int, format, promTextfilePath string, debug bool) int {
  jobName := config.JobName
  if jobName == "" {
    jobName = "gowatch"
  }
  intervalSec = maxInt(intervalSec, 1)
  if config.TopN <= 0 {
    config.TopN = 5
  }

  logger, err := newLogger(config.LogFile, format, debug || strings.EqualFold(config.LogLevel, "DEBUG"))
  if err != nil {
    fmt.Fprintf(os.Stderr, "init logger failed: %v\n", err)
    return ExitFail
  }

  lockPath := config.LockFile
  if lockPath == "" {
    lockPath = fmt.Sprintf("/tmp/gowatch_%s.lock", jobName)
  }
  fileLock := &FileLock{path: lockPath}
  if err := fileLock.Acquire(); err != nil {
    logger.Warn("lock not acquired, another instance may be running", map[string]any{"job": jobName, "lock": lockPath, "err": err.Error()})
    return ExitLock
  }
  defer fileLock.Release()

  // Prime gopsutil's delta-based percent counters so the first sample is sane.
  warmUpCPUAndProcesses()

  collector := newCollector(config.Mountpoints, config.NetIfaces)
  engine := newAlertEngine(parseThresholds(config))
  startedAt := time.Now()
  // anyAlert remembers whether any iteration alerted, to pick the exit code.
  anyAlert := false

  for {
    loopStartedAt := time.Now()
    metrics, err := collector.Collect()
    if err != nil {
      logger.Error("collect failed", map[string]any{"job": jobName, "err": err.Error()})
      return ExitFail
    }

    hasAlert, alerts := engine.Evaluate(metrics)
    anyAlert = anyAlert || hasAlert

    if format == "json" {
      logger.Info("sample", map[string]any{"job": jobName, "metrics": metrics, "alert": hasAlert, "alerts": alerts})
    } else {
      logger.Info(sampleSummary(jobName, metrics, hasAlert), map[string]any{"job": jobName})
    }

    // Only gather the (relatively expensive) top-N process listing when an
    // alert actually fired.
    if hasAlert {
      topProcesses := topNProcesses(config.TopN, config.IgnoreProcess)
      if format == "json" {
        logger.Warn("alert_detail", map[string]any{"job": jobName, "alerts": alerts, "topn": topProcesses})
      } else {
        logger.Warn("ALERT", map[string]any{"job": jobName, "alerts": alerts})
        logger.Warn("TopCPU", map[string]any{"job": jobName, "procs": topProcesses["cpu"]})
        logger.Warn("TopMEM", map[string]any{"job": jobName, "procs": topProcesses["mem"]})
      }
    }

    if promTextfilePath != "" {
      if err := atomicWriteText(promTextfilePath, buildPromText(metrics, alerts)); err != nil {
        logger.Error("prom textfile write failed", map[string]any{"job": jobName, "err": err.Error(), "path": promTextfilePath})
      }
    }

    if once {
      if hasAlert {
        return ExitAlert
      }
      return ExitOK
    }

    if durationSec > 0 && time.Since(startedAt) >= time.Duration(durationSec)*time.Second {
      logger.Info("duration reached, exiting", map[string]any{"job": jobName, "duration": durationSec})
      break
    }

    collectDuration := time.Since(loopStartedAt)
    if collectDuration > time.Duration(intervalSec)*time.Second {
      // Sampling took longer than the interval: skip the sleep entirely and
      // log the overrun instead.
      logger.Warn("collector_overrun", map[string]any{
        "job":          jobName,
        "spent_sec":    math.Round(collectDuration.Seconds()*1000) / 1000,
        "interval_sec": intervalSec,
      })
    } else {
      // Otherwise wait out the remainder of the sampling period while still
      // listening for the stop signal.
      select {
      case <-ctx.Done():
        logger.Info("received stop signal, exiting", map[string]any{"job": jobName})
        if anyAlert {
          return ExitAlert
        }
        return ExitOK
      case <-time.After(time.Duration(intervalSec)*time.Second - collectDuration):
      }
    }

    // Non-blocking re-check so a signal that arrived during an overrun
    // iteration (which skipped the select above) is still honored.
    select {
    case <-ctx.Done():
      logger.Info("received stop signal, exiting", map[string]any{"job": jobName})
      if anyAlert {
        return ExitAlert
      }
      return ExitOK
    default:
    }
  }

  if anyAlert {
    return ExitAlert
  }
  return ExitOK
}
​
// main parses the flags, loads the JSON config, resolves flag/config/default
// precedence, wires SIGINT/SIGTERM into a cancelable context, and exits
// with the code produced by run.
func main() {
  var (
    configPath   = flag.String("config", "", "config.json path")
    once         = flag.Bool("once", false, "collect once and exit")
    interval     = flag.Int("interval", -1, "override interval seconds")
    duration     = flag.Int("duration", -1, "run duration seconds")
    format       = flag.String("format", "", "log format: json|text")
    promTextfile = flag.String("prom-textfile", "", "prometheus textfile output path")
    debug        = flag.Bool("debug", false, "enable debug logging")
  )
  flag.Parse()

  if *configPath == "" {
    fmt.Fprintln(os.Stderr, "--config is required")
    os.Exit(ExitFail)
  }

  var config Config
  if err := readJSONFile(*configPath, &config); err != nil {
    fmt.Fprintf(os.Stderr, "read config failed: %v\n", err)
    os.Exit(ExitFail)
  }

  // Flags (when set) beat config values, which beat built-in defaults.
  finalFormat := configValueString(*format, config.Format, "json")
  finalPromPath := configValueString(*promTextfile, config.PromTextfile, "")
  finalInterval := configValueInt(*interval, config.Interval, 5)
  finalDuration := *duration
  if finalDuration < 0 {
    finalDuration = 0 // 0 means "run until signaled"
  }

  ctx, cancel := context.WithCancel(context.Background())
  defer cancel()

  // Translate the first termination signal into a context cancellation that
  // run's select loops observe.
  signalCh := make(chan os.Signal, 1)
  signalNotify(signalCh)
  go func() {
    <-signalCh
    cancel()
  }()

  exitCode := run(ctx, config, *once, finalInterval, finalDuration, finalFormat, finalPromPath, *debug)
  os.Exit(exitCode)
}
​
// signalNotify registers the channel for the common termination signals
// (SIGINT and SIGTERM).
func signalNotify(signalCh chan<- os.Signal) {
  signal.Notify(signalCh, syscall.SIGINT, syscall.SIGTERM)
}

1. 这个工具能做什么

  • 周期性采集主机运行状态,包括 CPU、Load、内存、Swap、磁盘、网络、开机时间和运行时长。

  • 支持一次性采样,也支持按固定间隔持续运行。

  • 支持配置运行时长,到达指定时间后自动退出。

  • 支持按挂载点采集磁盘使用率,默认至少会关注根分区 /

  • 支持按网卡白名单采集网络流量,并计算每秒收发速率。

  • 支持基于阈值做告警判断,当前支持 cpu_percent、mem_percent、disk_percent、load1

  • 支持连续命中次数控制,避免单次抖动直接触发告警。

  • 支持按 CPU 核数计算 load1 阈值,适合多核机器。

  • 告警触发时会额外输出 CPU 和内存占用最高的进程列表,方便定位问题。

  • 支持忽略指定进程名,避免监控结果被无关进程干扰。

  • 支持 json 和 text 两种日志格式。

  • 支持同时输出到标准输出和日志文件。

  • 如果配置了日志文件,内置日志轮转,单文件达到 10 MB 后自动切分,默认保留 7 份备份。

  • 支持导出 Prometheus textfile,可直接配合 Node Exporter 的 textfile collector 使用。

  • 通过文件锁保证同一个 job 只运行一个实例,避免重复采样。

  • 支持响应 SIGINT 和 SIGTERM,适合被 systemd、crontab 或脚本调度。

2. 适合的使用场景

  • 给一台机器做轻量的本地巡检

  • 被 crontab 或 systemd 定时拉起

  • 作为批处理任务前后的健康检查

  • 把系统关键指标以 JSON 打到日志平台

  • 把告警状态和资源指标暴露给 Prometheus

3. 采集内容

  • CPU: 总使用率、每核使用率、CPU 核数

  • Load: load1、load5、load15

  • 内存: 总量、可用量、已用量、使用率

  • Swap: 总量、已用量、使用率

  • 磁盘: 每个挂载点的总量、已用量、剩余量、使用率

  • 磁盘 IO: 全局读写字节数、读写次数

  • 网络: 每块网卡的累计收发字节数、每秒收发速率

  • 主机信息: 主机名、开机时间、运行时长、采样时间

4. 告警能力

支持的阈值类型如下:

  • cpu_percent: CPU 总使用率

  • mem_percent: 内存使用率

  • disk_percent: 指定挂载点的磁盘使用率

  • load1: 1 分钟平均负载

每条阈值规则支持这些能力:

  • gt: 大于该值时触发

  • lt: 小于该值时触发

  • times: 连续命中多少次后才真正告警

  • mount: 仅用于 disk_percent,指定要检查的挂载点

  • gt_per_cpu: 仅用于 load1,按 CPU 核数 x 阈值 计算上限

告警触发后:

  • 单次采样会返回退出码 10

  • 持续运行模式下会记录告警详情

  • 会输出 Top N CPU 进程和 Top N 内存进程

5. 输出形式

5.1 文本日志

适合直接在终端查看,输出类似:

2026-03-21T12:00:00+08:00 INFO  job=gowatch cpu=13.7% mem=61.2% load1=1.25 disk/=72.4 alert=false

5.2 JSON 日志

适合被日志系统或脚本消费,采样结果和告警详情都会以 JSON 输出。

5.3 Prometheus textfile

如果指定 --prom-textfile 或配置项 prom_textfile,会输出这些指标:

  • gowatch_cpu_percent

  • gowatch_mem_percent

  • gowatch_load1

  • gowatch_disk_percent

  • gowatch_net_rx_bytes_per_sec

  • gowatch_net_tx_bytes_per_sec

  • gowatch_alert_active

6. 命令行参数

参数

说明

--config

配置文件路径,必填

--once

只采样一次后退出

--interval

覆盖配置文件中的采样间隔,单位秒

--duration

运行总时长,单位秒;到时自动退出

--format

日志格式,支持 json 和 text

--prom-textfile

Prometheus textfile 输出路径

--debug

打开调试日志

7. 配置项说明

配置项

说明

jobname

任务名称,会出现在日志和锁文件名中

interval

采样间隔,默认值为 5 秒

format

输出格式,默认值为 json

log_file

日志文件路径;配置后会同时写终端和文件

log_level

当前主要用于开启 DEBUG 级别日志

lock_file

文件锁路径;未配置时会自动落到 /tmp

mountpoints

需要采集的挂载点列表

net_ifaces

需要采集的网卡列表;为空时采集全部网卡

topn

告警时输出多少个高占用进程,默认值为 5

ignore_process

告警时需要忽略的进程名列表

thresholds

告警规则集合

prom_textfile

Prometheus textfile 输出路径

8. 退出码

退出码

含义

0

正常结束,没有告警

10

运行过程中出现过告警,或单次采样命中告警

20

采集、配置读取或初始化失败

30

没拿到文件锁,通常表示已有同名任务在运行

9. 快速开始

9.1 构建

go build -o gowatch .

config.json 示例配置

{
  "jobname": "gowatch",
  "interval": 5,
  "format": "json",
  "log_file": "/tmp/gowatch/gowatch.log",
  "log_level": "INFO",
  "lock_file": "/tmp/gowatch/gowatch.lock",
  "mountpoints": ["/"],
  "net_ifaces": ["eth0"],
  "topn": 5,
  "ignore_process": ["systemd", "kthreadd"],
  "thresholds": {
    "cpu_percent": { "gt": 85, "times": 3 },
    "mem_percent": { "gt": 80, "times": 3 },
    "disk_percent": { "gt": 80, "times": 1, "mount": "/" },
    "load1": { "gt_per_cpu": 1.5, "times": 2 }
  },
  "prom_textfile": "/tmp/gowatch/gowatch.prom"
}

9.2 运行

9.2.1 持续监控
./gowatch --config ./config.json
root@hosthatchhk:~/gowatch# ./gowatch --config ./config.json
{"alert":false,"alerts":null,"job":"gowatch","level":"INFO","metrics":{"ts":"2026-03-21T23:55:27+08:00","host":"hosthatchhk","cpu":{"count":3,"percent":50,"percpu_percent":[80.00000074505806,0,33.33333022892468]},"load":{"load1":0.14,"load5":0.14,"load15":0.19},"mem":{"total":6225928192,"available":1836888064,"used":4060340224,"percent":65.21662471496748,"swap_total":0,"swap_used":0,"swap_percent":0},"disk":{"usage":{"/":{"total":58760200192,"used":39364460544,"free":16377626624,"percent":70.61892107728262}},"io":{"read_bytes":152716232704,"write_bytes":9487571705856,"read_count":3084159,"write_count":935100404}},"net":{"counters":{"eth0":{"bytes_sent":273623595999,"bytes_recv":546038534609}},"rate":{}},"uptime_sec":8563532,"boot_time":"2025-12-12T21:09:56+08:00"},"msg":"sample","ts":"2026-03-21T23:55:27+08:00"}
{"alert":false,"alerts":null,"job":"gowatch","level":"INFO","metrics":{"ts":"2026-03-21T23:55:32+08:00","host":"hosthatchhk","cpu":{"count":3,"percent":5.827193577096099,"percpu_percent":[6.43863177954638,4.780876480349941,6.477732835501928]},"load":{"load1":0.13,"load5":0.14,"load15":0.19},"mem":{"total":6225928192,"available":1836658688,"used":4060557312,"percent":65.2201115524848,"swap_total":0,"swap_used":0,"swap_percent":0},"disk":{"usage":{"/":{"total":58760200192,"used":39364501504,"free":16377585664,"percent":70.61899455856414}},"io":{"read_bytes":152716232704,"write_bytes":9487571894272,"read_count":3084159,"write_count":935100418}},"net":{"counters":{"eth0":{"bytes_sent":273623758861,"bytes_recv":546038714259}},"rate":{"eth0":{"rx_Bps":35938.82178039044,"tx_Bps":32580.397399376274,"dt":4.998772667}}},"uptime_sec":8563537,"boot_time":"2025-12-12T21:09:56+08:00"},"msg":"sample","ts":"2026-03-21T23:55:32+08:00"}
{"alert":false,"alerts":null,"job":"gowatch","level":"INFO","metrics":{"ts":"2026-03-21T23:55:37+08:00","host":"hosthatchhk","cpu":{"count":3,"percent":7.262945515373922,"percpu_percent":[7.272727242966,7.085020236121907,6.666666650360682]},"load":{"load1":0.12,"load5":0.14,"load15":0.18},"mem":{"total":6225928192,"available":1828683776,"used":4068552704,"percent":65.34853243614153,"swap_total":0,"swap_used":0,"swap_percent":0},"disk":{"usage":{"/":{"total":58760200192,"used":39364603904,"free":16377483264,"percent":70.61917826176796}},"io":{"read_bytes":152716232704,"write_bytes":9487572082688,"read_count":3084159,"write_count":935100438}},"net":{"counters":{"eth0":{"bytes_sent":273623897867,"bytes_recv":546039385846}},"rate":{"eth0":{"rx_Bps":134264.89941679427,"tx_Bps":27790.333357153886,"dt":5.001955112}}},"uptime_sec":8563542,"boot_time":"2025-12-12T21:09:56+08:00"},"msg":"sample","ts":"2026-03-21T23:55:37+08:00"}
^C{"job":"gowatch","level":"INFO","msg":"received stop signal, exiting","ts":"2026-03-21T23:55:38+08:00"}
root@hosthatchhk:~/gowatch#
9.2.2 只采样一次
./gowatch --config ./config.json --once
root@hosthatchhk:~/gowatch# ./gowatch --config ./config.json --once
{"alert":false,"alerts":null,"job":"gowatch","level":"INFO","metrics":{"ts":"2026-03-21T23:54:56+08:00","host":"hosthatchhk","cpu":{"count":3,"percent":50.000001552204274,"percpu_percent":[80,40,0]},"load":{"load1":0.02,"load5":0.12,"load15":0.18},"mem":{"total":6225928192,"available":1822437376,"used":4074803200,"percent":65.44892704088548,"swap_total":0,"swap_used":0,"swap_percent":0},"disk":{"usage":{"/":{"total":58760200192,"used":39363870720,"free":16378216448,"percent":70.61786294682865}},"io":{"read_bytes":152716232704,"write_bytes":9487528665088,"read_count":3084159,"write_count":935096172}},"net":{"counters":{"eth0":{"bytes_sent":273622580613,"bytes_recv":546036263097}},"rate":{}},"uptime_sec":8563501,"boot_time":"2025-12-12T21:09:56+08:00"},"msg":"sample","ts":"2026-03-21T23:54:57+08:00"}
root@hosthatchhk:~/gowatch#
9.2.3 覆盖采样间隔并输出为文本日志
./gowatch --config ./config.json --interval 10 --format text
root@hosthatchhk:~/gowatch# ./gowatch --config ./config.json --interval 10 --format text
2026-03-21T23:53:58+08:00 INFO  job=gowatch cpu=50.0% mem=65.2% load1=0.07 disk/=70.6 alert=false job=gowatch
2026-03-21T23:54:08+08:00 INFO  job=gowatch cpu=6.4% mem=65.3% load1=0.06 disk/=70.6 alert=false job=gowatch
2026-03-21T23:54:18+08:00 INFO  job=gowatch cpu=6.0% mem=65.4% load1=0.05 disk/=70.6 alert=false job=gowatch
2026-03-21T23:54:28+08:00 INFO  job=gowatch cpu=11.4% mem=65.5% load1=0.04 disk/=70.6 alert=false job=gowatch
^C2026-03-21T23:54:29+08:00 INFO  received stop signal, exiting job=gowatch
root@hosthatchhk:~/gowatch#
9.2.4 运行 15 秒后退出
./gowatch --config ./config.json --format text --duration 15
root@hosthatchhk:~/gowatch# ./gowatch --config ./config.json --format text --duration 15
2026-03-21T23:57:18+08:00 INFO  job=gowatch cpu=45.5% mem=65.0% load1=0.04 disk/=70.6 alert=false job=gowatch
2026-03-21T23:57:23+08:00 INFO  job=gowatch cpu=18.3% mem=65.1% load1=0.04 disk/=70.6 alert=false job=gowatch
2026-03-21T23:57:28+08:00 INFO  job=gowatch cpu=5.9% mem=65.3% load1=0.03 disk/=70.6 alert=false job=gowatch
2026-03-21T23:57:33+08:00 INFO  job=gowatch cpu=4.9% mem=65.3% load1=0.03 disk/=70.6 alert=false job=gowatch
2026-03-21T23:57:33+08:00 INFO  duration reached, exiting duration=15 job=gowatch
root@hosthatchhk:~/gowatch#
9.2.5 导出 Prometheus textfile
./gowatch --config ./config.json --prom-textfile /var/lib/node_exporter/textfile/gowatch.prom
root@hosthatchhk:~/gowatch# ./gowatch --config ./config.json --prom-textfile /var/lib/node_exporter/textfile/gowatch.prom
{"alert":false,"alerts":null,"job":"gowatch","level":"INFO","metrics":{"ts":"2026-03-21T23:59:36+08:00","host":"hosthatchhk","cpu":{"count":3,"percent":36.363637903012474,"percpu_percent":[0,0.0000023283063801601846,100]},"load":{"load1":0.27,"load5":0.14,"load15":0.17},"mem":{"total":6225928192,"available":1828020224,"used":4069199872,"percent":65.35892715930636,"swap_total":0,"swap_used":0,"swap_percent":0},"disk":{"usage":{"/":{"total":58760200192,"used":39369326592,"free":16372760576,"percent":70.62765065352782}},"io":{"read_bytes":152716445696,"write_bytes":9487915646976,"read_count":3084169,"write_count":935135058}},"net":{"counters":{"eth0":{"bytes_sent":273631942144,"bytes_recv":546057293228}},"rate":{}},"uptime_sec":8563781,"boot_time":"2025-12-12T21:09:56+08:00"},"msg":"sample","ts":"2026-03-21T23:59:36+08:00"}
{"alert":false,"alerts":null,"job":"gowatch","level":"INFO","metrics":{"ts":"2026-03-21T23:59:41+08:00","host":"hosthatchhk","cpu":{"count":3,"percent":8.19341841829658,"percpu_percent":[7.4898785615917856,7.085020271155843,9.456740420938434]},"load":{"load1":0.25,"load5":0.14,"load15":0.17},"mem":{"total":6225928192,"available":1822982144,"used":4074246144,"percent":65.43997968423726,"swap_total":0,"swap_used":0,"swap_percent":0},"disk":{"usage":{"/":{"total":58760200192,"used":39369478144,"free":16372609024,"percent":70.62792253426946}},"io":{"read_bytes":152716445696,"write_bytes":9487916007424,"read_count":3084169,"write_count":935135086}},"net":{"counters":{"eth0":{"bytes_sent":273632105941,"bytes_recv":546057754952}},"rate":{"eth0":{"rx_Bps":92323.40111590724,"tx_Bps":32751.808726820043,"dt":5.001158909}}},"uptime_sec":8563786,"boot_time":"2025-12-12T21:09:56+08:00"},"msg":"sample","ts":"2026-03-21T23:59:41+08:00"}
{"alert":false,"alerts":null,"job":"gowatch","level":"INFO","metrics":{"ts":"2026-03-21T23:59:46+08:00","host":"hosthatchhk","cpu":{"count":3,"percent":7.4747474715515265,"percpu_percent":[8.216432838755008,7.04225349189674,7.723577255538492]},"load":{"load1":0.23,"load5":0.13,"load15":0.17},"mem":{"total":6225928192,"available":1823047680,"used":4074180608,"percent":65.43892705404335,"swap_total":0,"swap_used":0,"swap_percent":0},"disk":{"usage":{"/":{"total":58760200192,"used":39369527296,"free":16372559872,"percent":70.62801071180729}},"io":{"read_bytes":152716445696,"write_bytes":9487953575936,"read_count":3084169,"write_count":935138866}},"net":{"counters":{"eth0":{"bytes_sent":273632350259,"bytes_recv":546057999243}},"rate":{"eth0":{"rx_Bps":48858.644203249634,"tx_Bps":48864.04425234472,"dt":4.999954542}}},"uptime_sec":8563791,"boot_time":"2025-12-12T21:09:56+08:00"},"msg":"sample","ts":"2026-03-21T23:59:46+08:00"}
{"alert":false,"alerts":null,"job":"gowatch","level":"INFO","metrics":{"ts":"2026-03-21T23:59:51+08:00","host":"hosthatchhk","cpu":{"count":3,"percent":6.908115366745801,"percpu_percent":[6.2248996036321795,8.016032071234704,6.464646452825576]},"load":{"load1":0.21,"load5":0.13,"load15":0.17},"mem":{"total":6225928192,"available":1820225536,"used":4077002752,"percent":65.4842559417685,"swap_total":0,"swap_used":0,"swap_percent":0},"disk":{"usage":{"/":{"total":58760200192,"used":39369633792,"free":16372453376,"percent":70.62820176313926}},"io":{"read_bytes":152716445696,"write_bytes":9487954894848,"read_count":3084169,"write_count":935139038}},"net":{"counters":{"eth0":{"bytes_sent":273632486502,"bytes_recv":546058457714}},"rate":{"eth0":{"rx_Bps":91584.09062753986,"tx_Bps":27215.87899642052,"dt":5.006011381}}},"uptime_sec":8563796,"boot_time":"2025-12-12T21:09:56+08:00"},"msg":"sample","ts":"2026-03-21T23:59:51+08:00"}
^C{"job":"gowatch","level":"INFO","msg":"received stop signal, exiting","ts":"2026-03-21T23:59:52+08:00"}
root@hosthatchhk:~/gowatch# ls -alh /var/lib/node_exporter/textfile/gowatch.prom
-rw------- 1 root root 988 Mar 21 23:59 /var/lib/node_exporter/textfile/gowatch.prom
root@hosthatchhk:~/gowatch# cat /var/lib/node_exporter/textfile/gowatch.prom
# HELP gowatch_cpu_percent CPU usage percent
# TYPE gowatch_cpu_percent gauge
gowatch_cpu_percent 6.908
# HELP gowatch_mem_percent Memory usage percent
# TYPE gowatch_mem_percent gauge
gowatch_mem_percent 65.484
# HELP gowatch_load1 System load1
# TYPE gowatch_load1 gauge
gowatch_load1 0.210
# HELP gowatch_disk_percent Disk usage percent by mount
# TYPE gowatch_disk_percent gauge
gowatch_disk_percent{mount="/"} 70.628
# HELP gowatch_net_rx_bytes_per_sec Network RX rate bytes/s
# TYPE gowatch_net_rx_bytes_per_sec gauge
# HELP gowatch_net_tx_bytes_per_sec Network TX rate bytes/s
# TYPE gowatch_net_tx_bytes_per_sec gauge
gowatch_net_rx_bytes_per_sec{iface="eth0"} 91584.091
gowatch_net_tx_bytes_per_sec{iface="eth0"} 27215.879
# HELP gowatch_alert_active Alert active (0/1) by type
# TYPE gowatch_alert_active gauge
gowatch_alert_active{type="cpu_percent"} 0
gowatch_alert_active{type="mem_percent"} 0
gowatch_alert_active{type="disk_percent"} 0
gowatch_alert_active{type="load1"} 0
root@hosthatchhk:~/gowatch#


评论