小小的运维
小小的运维
发布于 2026-03-22 / 15 阅读
0
0

Go 日常巡检CLI工具分享

gowatch

分享一个很久之前学习go做的一个日常巡检工具,如果你也在学习go,作为入门的一个学习案例,还是挺不错的。

源码

package main
​
import (
  "context"
  "encoding/json"
  "flag"
  "fmt"
  "io"
  "log"
  "math"
  "os"
  "os/signal"
  "path/filepath"
  "sort"
  "strings"
  "sync"
  "syscall"
  "time"
  "unicode/utf8"

  "github.com/shirou/gopsutil/v3/cpu"
  "github.com/shirou/gopsutil/v3/disk"
  "github.com/shirou/gopsutil/v3/host"
  "github.com/shirou/gopsutil/v3/load"
  "github.com/shirou/gopsutil/v3/mem"
  gnet "github.com/shirou/gopsutil/v3/net"
  "github.com/shirou/gopsutil/v3/process"
)
​
// Process exit codes, so external scripts can tell apart a clean run,
// an alerting run, a hard failure, and a lock conflict.
const (
  ExitOK    = 0  // finished without any alert
  ExitAlert = 10 // at least one threshold alert fired during the run
  ExitFail  = 20 // collection, configuration, or initialization failure
  ExitLock  = 30 // file lock not acquired (another instance is running)
)
​
// Config mirrors the JSON configuration file that drives a run.
// CLI flags may override Interval, Format, and PromTextfile (see main).
type Config struct {
  JobName       string                   `json:"jobname"`
  Interval      int                      `json:"interval"` // sampling interval, seconds
  Format        string                   `json:"format"`   // "json" or "text"
  LogFile       string                   `json:"log_file"`
  LogLevel      string                   `json:"log_level"` // "DEBUG" enables debug logging
  LockFile      string                   `json:"lock_file"`
  Mountpoints   []string                 `json:"mountpoints"`
  NetIfaces     []string                 `json:"net_ifaces"` // empty means all interfaces
  TopN          int                      `json:"topn"`
  IgnoreProcess []string                 `json:"ignore_process"`
  Thresholds    map[string]ThresholdRule `json:"thresholds"`
  PromTextfile  string                   `json:"prom_textfile"`
}
​
// ThresholdRule is the raw threshold definition as written in the config
// file; nil pointer fields mean "not set".
type ThresholdRule struct {
  GT       *float64 `json:"gt"`         // trigger when value > GT
  LT       *float64 `json:"lt"`         // trigger when value < LT
  Times    int      `json:"times"`      // consecutive hits required; defaulted to 1 at parse time
  Mount    string   `json:"mount"`      // disk_percent only: mountpoint to check
  GTPerCPU *float64 `json:"gt_per_cpu"` // load1 only: per-core load limit
}
​
// Threshold is the runtime form of a rule with defaults applied
// (Times is always >= 1 after parseThresholds).
type Threshold struct {
  Name     string
  GT       *float64
  LT       *float64
  Times    int
  Mount    string
  GTPerCPU *float64
}
​
// Alert is one fired alert: the rule that triggered, the observed value,
// the consecutive hit count, and the rule definition for context.
type Alert struct {
  Type   string         `json:"type"`
  Mount  string         `json:"mount,omitempty"`
  Value  float64        `json:"value"`
  Times  int            `json:"times"`
  Reason string         `json:"reason"`
  Rule   map[string]any `json:"rule"`
}
​
// CPUInfo aggregates CPU core count, total utilization, and per-core utilization.
type CPUInfo struct {
  Count   int       `json:"count"`
  Percent float64   `json:"percent"`
  PerCPU  []float64 `json:"percpu_percent,omitempty"`
}
​
// LoadInfo holds the 1/5/15-minute system load averages.
type LoadInfo struct {
  Load1  float64 `json:"load1"`
  Load5  float64 `json:"load5"`
  Load15 float64 `json:"load15"`
}
​
// MemInfo aggregates RAM and swap usage.
type MemInfo struct {
  Total       uint64  `json:"total"`
  Available   uint64  `json:"available"`
  Used        uint64  `json:"used"`
  Percent     float64 `json:"percent"`
  SwapTotal   uint64  `json:"swap_total"`
  SwapUsed    uint64  `json:"swap_used"`
  SwapPercent float64 `json:"swap_percent"`
}
​
// DiskUsage records space usage for one mountpoint. When the probe failed,
// Error carries the message and the numeric fields are left zero.
type DiskUsage struct {
  Total   uint64  `json:"total,omitempty"`
  Used    uint64  `json:"used,omitempty"`
  Free    uint64  `json:"free,omitempty"`
  Percent float64 `json:"percent,omitempty"`
  Error   string  `json:"error,omitempty"`
}
​
// DiskIO holds cumulative disk IO counters summed over all devices.
type DiskIO struct {
  ReadBytes  uint64 `json:"read_bytes"`
  WriteBytes uint64 `json:"write_bytes"`
  ReadCount  uint64 `json:"read_count"`
  WriteCount uint64 `json:"write_count"`
}
​
// DiskInfo bundles per-mount usage with the global IO counters.
type DiskInfo struct {
  Usage map[string]DiskUsage `json:"usage"`
  IO    DiskIO               `json:"io"`
}
​
// NetCounter holds the cumulative sent/received byte counts of one NIC.
type NetCounter struct {
  BytesSent uint64 `json:"bytes_sent"`
  BytesRecv uint64 `json:"bytes_recv"`
}
​
// NetRate is the per-NIC instantaneous rate derived between two samples;
// Dt is the sampling gap in seconds used for the division.
type NetRate struct {
  RxBps float64 `json:"rx_Bps"`
  TxBps float64 `json:"tx_Bps"`
  Dt    float64 `json:"dt"`
}
​
// NetInfo bundles cumulative NIC counters and the derived rates.
type NetInfo struct {
  Counters map[string]NetCounter `json:"counters"`
  Rate     map[string]NetRate    `json:"rate"`
}
​
// Metrics is the complete result of one sample.
type Metrics struct {
  TS        string   `json:"ts"` // RFC 3339 sample timestamp
  Host      string   `json:"host"`
  CPU       CPUInfo  `json:"cpu"`
  Load      LoadInfo `json:"load"`
  Mem       MemInfo  `json:"mem"`
  Disk      DiskInfo `json:"disk"`
  Net       NetInfo  `json:"net"`
  UptimeSec uint64   `json:"uptime_sec"`
  BootTime  string   `json:"boot_time"`
}
​
// ProcessInfo describes one process in the top-N resource listings.
type ProcessInfo struct {
  PID        int32   `json:"pid"`
  Name       string  `json:"name"`
  User       string  `json:"user"`
  CPUPercent float64 `json:"cpu_percent"`
  RSS        uint64  `json:"rss"`
  Cmdline    string  `json:"cmdline"` // truncated to at most 500 bytes
}
​
// NetSnapshot remembers the previous NIC counters and when they were taken,
// so the next sample can derive per-second rates.
type NetSnapshot struct {
  CapturedAt    time.Time
  CountersByNIC map[string]NetCounter
}
​
// Collector reads system metrics via gopsutil. It keeps the previous
// network snapshot so per-NIC rates can be computed between samples.
type Collector struct {
  mountpoints         []string
  netIfaces           []string
  previousNetSnapshot *NetSnapshot // nil until the first sample completes
}
​
// AlertEngine evaluates thresholds against each sample, tracking the
// consecutive hit count per rule key.
type AlertEngine struct {
  thresholds []Threshold
  hitCounts  map[string]int // rule key -> consecutive hits; reset on miss
}
​
// RotateWriter is an io.Writer that rotates the target file once a write
// would push it past maxBytes, keeping numbered backups.
type RotateWriter struct {
  mu          sync.Mutex // serializes Write/rotate against concurrent loggers
  path        string
  maxBytes    int64 // non-positive disables rotation
  backupCount int
  currentFile *os.File
}
​
// AppLogger wraps a standard logger with either "json" or text formatting.
type AppLogger struct {
  logger *log.Logger
  format string // "json" selects JSON records; anything else yields text lines
  debug  bool   // when false, DEBUG-level records are dropped
}
​
// FileLock guarantees a single running instance per job via flock(2).
type FileLock struct {
  path     string
  lockFile *os.File // non-nil only while the lock is held
}
​
// isoNow returns the current wall-clock time formatted as RFC 3339.
func isoNow() string {
  now := time.Now()
  return now.Format(time.RFC3339)
}
​
// maxInt reports the larger of two ints.
func maxInt(left, right int) int {
  result := right
  if left > right {
    result = left
  }
  return result
}
​
func readJSONFile(path string, target any) error {
  fileContent, err := os.ReadFile(path)
  if err != nil {
    return err
  }
  return json.Unmarshal(fileContent, target)
}
​
func ensureDir(path string) error {
  if path == "" || path == "." {
    return nil
  }
  return os.MkdirAll(path, 0o755)
}
​
// atomicWriteText 先写入临时文件再替换目标文件,避免读到半截内容。
func atomicWriteText(path, content string) error {
  dirPath := filepath.Dir(path)
  if err := ensureDir(dirPath); err != nil {
    return err
  }
  tempFile, err := os.CreateTemp(dirPath, ".tmp_*")
  if err != nil {
    return err
  }
  tempFilePath := tempFile.Name()
  defer os.Remove(tempFilePath)
​
  if _, err := tempFile.WriteString(content); err != nil {
    tempFile.Close()
    return err
  }
  if err := tempFile.Sync(); err != nil {
    tempFile.Close()
    return err
  }
  if err := tempFile.Close(); err != nil {
    return err
  }
  return os.Rename(tempFilePath, path)
}
​
// newRotateWriter constructs a RotateWriter and opens its target file.
func newRotateWriter(path string, maxBytes int64, backupCount int) (*RotateWriter, error) {
  w := &RotateWriter{
    path:        path,
    maxBytes:    maxBytes,
    backupCount: backupCount,
  }
  if err := w.open(); err != nil {
    return nil, err
  }
  return w, nil
}
​
// open ensures the log directory exists and opens the log file for appending.
func (writer *RotateWriter) open() error {
  if err := ensureDir(filepath.Dir(writer.path)); err != nil {
    return err
  }
  handle, err := os.OpenFile(writer.path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644)
  if err == nil {
    writer.currentFile = handle
  }
  return err
}
​
// rotate closes the active file, shifts existing numbered backups up by one
// (highest index first so nothing gets clobbered early), moves the live file
// to ".1", and reopens a fresh live file.
func (writer *RotateWriter) rotate() error {
  if writer.currentFile != nil {
    _ = writer.currentFile.Close()
    writer.currentFile = nil
  }
  for idx := writer.backupCount - 1; idx >= 1; idx-- {
    src := fmt.Sprintf("%s.%d", writer.path, idx)
    dst := fmt.Sprintf("%s.%d", writer.path, idx+1)
    if _, statErr := os.Stat(src); statErr == nil {
      _ = os.Rename(src, dst)
    }
  }
  if _, statErr := os.Stat(writer.path); statErr == nil {
    _ = os.Rename(writer.path, writer.path+".1")
  }
  return writer.open()
}
​
func (writer *RotateWriter) Write(data []byte) (int, error) {
  writer.mu.Lock()
  defer writer.mu.Unlock()
​
  if writer.currentFile == nil {
    if err := writer.open(); err != nil {
      return 0, err
    }
  }
  fileInfo, err := writer.currentFile.Stat()
  if err == nil && fileInfo.Size()+int64(len(data)) > writer.maxBytes && writer.maxBytes > 0 {
    if err := writer.rotate(); err != nil {
      return 0, err
    }
  }
  return writer.currentFile.Write(data)
}
​
// newLogger builds an AppLogger that always writes to stdout and, when
// logFile is set, additionally to a rotating file (10 MiB cap, 7 backups).
func newLogger(logFile, format string, debug bool) (*AppLogger, error) {
  sinks := []io.Writer{os.Stdout}
  if logFile != "" {
    fileSink, err := newRotateWriter(logFile, 10*1024*1024, 7)
    if err != nil {
      return nil, err
    }
    sinks = append(sinks, fileSink)
  }
  return &AppLogger{
    logger: log.New(io.MultiWriter(sinks...), "", 0),
    format: format,
    debug:  debug,
  }, nil
}
​
func (appLogger *AppLogger) log(level, msg string, fields map[string]any) {
  if level == "DEBUG" && !appLogger.debug {
    return
  }
  if fields == nil {
    fields = map[string]any{}
  }
  if appLogger.format == "json" {
    logFields := map[string]any{
      "ts":    isoNow(),
      "level": level,
      "msg":   msg,
    }
    for fieldName, fieldValue := range fields {
      logFields[fieldName] = fieldValue
    }
    jsonPayload, _ := json.Marshal(logFields)
    appLogger.logger.Println(string(jsonPayload))
    return
  }
​
  var fieldParts []string
  for fieldName, fieldValue := range fields {
    fieldParts = append(fieldParts, fmt.Sprintf("%s=%v", fieldName, fieldValue))
  }
  sort.Strings(fieldParts)
  line := fmt.Sprintf("%s %-5s %s", isoNow(), level, msg)
  if len(fieldParts) > 0 {
    line += " " + strings.Join(fieldParts, " ")
  }
  appLogger.logger.Println(line)
}
​
// Info logs a message at INFO level.
func (appLogger *AppLogger) Info(msg string, fields map[string]any) {
  appLogger.log("INFO", msg, fields)
}
// Warn logs a message at WARN level.
func (appLogger *AppLogger) Warn(msg string, fields map[string]any) {
  appLogger.log("WARN", msg, fields)
}
// Error logs a message at ERROR level.
func (appLogger *AppLogger) Error(msg string, fields map[string]any) {
  appLogger.log("ERROR", msg, fields)
}
// Debug logs a message at DEBUG level; suppressed unless debug mode is on.
func (appLogger *AppLogger) Debug(msg string, fields map[string]any) {
  appLogger.log("DEBUG", msg, fields)
}
​
// Acquire opens (creating if needed) the lock file and takes a non-blocking
// exclusive flock on it, so at most one instance per job can run. On success
// the current PID is written into the file on a best-effort basis.
func (lock *FileLock) Acquire() error {
  if err := ensureDir(filepath.Dir(lock.path)); err != nil {
    return err
  }
  handle, err := os.OpenFile(lock.path, os.O_CREATE|os.O_RDWR, 0o644)
  if err != nil {
    return err
  }
  if flockErr := syscall.Flock(int(handle.Fd()), syscall.LOCK_EX|syscall.LOCK_NB); flockErr != nil {
    _ = handle.Close()
    return flockErr
  }
  // Best effort: record our PID so operators can see who holds the lock.
  if truncErr := handle.Truncate(0); truncErr == nil {
    _, _ = handle.WriteString(fmt.Sprintf("%d\n", os.Getpid()))
  }
  lock.lockFile = handle
  return nil
}
​
// Release drops the flock and closes the lock file; calling it without a
// held lock is a no-op.
func (lock *FileLock) Release() {
  handle := lock.lockFile
  if handle == nil {
    return
  }
  lock.lockFile = nil
  _ = syscall.Flock(int(handle.Fd()), syscall.LOCK_UN)
  _ = handle.Close()
}
​
// parseThresholds 把配置中的阈值规则整理成运行时结构,并补齐默认触发次数。
func parseThresholds(config Config) []Threshold {
  var parsedThresholds []Threshold
  for name, rule := range config.Thresholds {
    requiredHits := rule.Times
    if requiredHits <= 0 {
      requiredHits = 1
    }
    parsedThresholds = append(parsedThresholds, Threshold{
      Name:     name,
      GT:       rule.GT,
      LT:       rule.LT,
      Times:    requiredHits,
      Mount:    rule.Mount,
      GTPerCPU: rule.GTPerCPU,
    })
  }
  return parsedThresholds
}
​
// Check reports whether value violates this threshold and, if so, returns a
// human-readable reason. For a load1 rule with gt_per_cpu set, the limit
// scales with the CPU count (treated as at least 1).
func (threshold Threshold) Check(value float64, cpuCount int) (bool, string) {
  if threshold.Name == "load1" && threshold.GTPerCPU != nil {
    cores := cpuCount
    if cores < 1 {
      cores = 1
    }
    limit := *threshold.GTPerCPU * float64(cores)
    return value > limit, fmt.Sprintf("load1 %.2f > %.2f (gt_per_cpu=%.2f)", value, limit, *threshold.GTPerCPU)
  }
  switch {
  case threshold.GT != nil && value > *threshold.GT:
    return true, fmt.Sprintf("%s %.2f > %.2f", threshold.Name, value, *threshold.GT)
  case threshold.LT != nil && value < *threshold.LT:
    return true, fmt.Sprintf("%s %.2f < %.2f", threshold.Name, value, *threshold.LT)
  }
  return false, ""
}
​
// newAlertEngine builds an AlertEngine with zeroed consecutive-hit counters.
func newAlertEngine(thresholds []Threshold) *AlertEngine {
  return &AlertEngine{thresholds: thresholds, hitCounts: map[string]int{}}
}
​
// Evaluate checks every configured threshold against the current sample,
// updates each rule's consecutive-hit counter, and returns whether any alert
// fired plus the list of fired alerts. A rule alerts only after being hit on
// `times` consecutive samples; a miss — or an unavailable metric — resets
// its counter to zero.
func (engine *AlertEngine) Evaluate(metrics Metrics) (bool, []Alert) {
  var alerts []Alert
  hasAlert := false

  // getMetricValue maps a rule name to the sampled value it guards; the
  // boolean is false when the metric is missing or errored for this sample.
  getMetricValue := func(threshold Threshold) (float64, bool) {
    switch threshold.Name {
    case "cpu_percent":
      return metrics.CPU.Percent, true
    case "mem_percent":
      return metrics.Mem.Percent, true
    case "load1":
      return metrics.Load.Load1, true
    case "disk_percent":
      mountpoint := threshold.Mount
      if mountpoint == "" {
        mountpoint = "/"
      }
      diskUsage, ok := metrics.Disk.Usage[mountpoint]
      if !ok || diskUsage.Error != "" {
        return 0, false
      }
      return diskUsage.Percent, true
    default:
      // Unknown rule names never trigger.
      return 0, false
    }
  }

  for _, threshold := range engine.thresholds {
    currentValue, ok := getMetricValue(threshold)
    // The counter key includes the mount so disk rules on different
    // mountpoints never share state.
    alertKey := threshold.Name
    if threshold.Mount != "" {
      alertKey += ":" + threshold.Mount
    }
    if !ok {
      engine.hitCounts[alertKey] = 0
      continue
    }
    hit, reason := threshold.Check(currentValue, metrics.CPU.Count)
    if hit {
      engine.hitCounts[alertKey]++
    } else {
      engine.hitCounts[alertKey] = 0
    }
    if hit && engine.hitCounts[alertKey] >= maxInt(threshold.Times, 1) {
      hasAlert = true
      alerts = append(alerts, Alert{
        Type:   threshold.Name,
        Mount:  threshold.Mount,
        Value:  currentValue,
        Times:  engine.hitCounts[alertKey],
        Reason: reason,
        Rule: map[string]any{
          "gt":         threshold.GT,
          "lt":         threshold.LT,
          "times":      threshold.Times,
          "gt_per_cpu": threshold.GTPerCPU,
        },
      })
    }
  }
  return hasAlert, alerts
}
​
// warmUpCPUAndProcesses performs throwaway CPU and per-process percent
// reads. gopsutil percentages are deltas between calls, so without this
// priming pass the first real sample would be badly skewed.
func warmUpCPUAndProcesses() {
  _, _ = cpu.Percent(0, false)
  _, _ = cpu.Percent(0, true)
  if procs, err := process.Processes(); err == nil {
    for _, proc := range procs {
      _, _ = proc.Percent(0)
    }
  }
}
​
// newCollector builds a Collector; with no mountpoints configured it falls
// back to watching only the root filesystem.
func newCollector(mountpoints, netIfaces []string) *Collector {
  targets := mountpoints
  if len(targets) == 0 {
    targets = []string{"/"}
  }
  return &Collector{mountpoints: targets, netIfaces: netIfaces}
}
​
// Collect performs one full sample: host identity and uptime, CPU, load,
// memory/swap, per-mount disk usage, aggregate disk IO, and per-NIC network
// counters plus rates derived from the previous snapshot. Load, swap, disk
// IO, and CPU-count failures are tolerated (fields left zero); the other
// probes abort the sample with an error.
func (collector *Collector) Collect() (Metrics, error) {
  var metrics Metrics
  metrics.TS = isoNow()

  hostInfo, err := host.Info()
  if err != nil {
    return metrics, err
  }
  metrics.Host = hostInfo.Hostname
  metrics.UptimeSec = hostInfo.Uptime
  metrics.BootTime = time.Unix(int64(hostInfo.BootTime), 0).Format(time.RFC3339)

  // Interval 0 means "since the previous cpu.Percent call"; the warm-up
  // pass at startup makes this first delta meaningful.
  totalCPUPercentages, err := cpu.Percent(0, false)
  if err != nil {
    return metrics, err
  }
  perCPUPercentages, err := cpu.Percent(0, true)
  if err != nil {
    return metrics, err
  }
  cpuCount, _ := cpu.Counts(true)
  totalCPUPercent := 0.0
  if len(totalCPUPercentages) > 0 {
    totalCPUPercent = totalCPUPercentages[0]
  }
  metrics.CPU = CPUInfo{Count: maxInt(cpuCount, 1), Percent: totalCPUPercent, PerCPU: perCPUPercentages}

  loadAverage, err := load.Avg()
  if err == nil {
    metrics.Load = LoadInfo{Load1: loadAverage.Load1, Load5: loadAverage.Load5, Load15: loadAverage.Load15}
  }

  virtualMemoryStat, err := mem.VirtualMemory()
  if err != nil {
    return metrics, err
  }
  swapMemoryStat, _ := mem.SwapMemory()
  metrics.Mem = MemInfo{
    Total:       virtualMemoryStat.Total,
    Available:   virtualMemoryStat.Available,
    Used:        virtualMemoryStat.Used,
    Percent:     virtualMemoryStat.UsedPercent,
    SwapTotal:   swapMemoryStat.Total,
    SwapUsed:    swapMemoryStat.Used,
    SwapPercent: swapMemoryStat.UsedPercent,
  }

  // Per-mount failures are recorded in the result rather than aborting,
  // so one bad mount cannot hide the rest of the sample.
  diskUsageByMount := map[string]DiskUsage{}
  for _, mountpoint := range collector.mountpoints {
    diskUsageStat, err := disk.Usage(mountpoint)
    if err != nil {
      diskUsageByMount[mountpoint] = DiskUsage{Error: err.Error()}
      continue
    }
    diskUsageByMount[mountpoint] = DiskUsage{
      Total:   diskUsageStat.Total,
      Used:    diskUsageStat.Used,
      Free:    diskUsageStat.Free,
      Percent: diskUsageStat.UsedPercent,
    }
  }

  diskIOCounters, _ := disk.IOCounters()
  diskIO := DiskIO{}
  for _, diskCounter := range diskIOCounters {
    diskIO.ReadBytes += diskCounter.ReadBytes
    diskIO.WriteBytes += diskCounter.WriteBytes
    diskIO.ReadCount += diskCounter.ReadCount
    diskIO.WriteCount += diskCounter.WriteCount
  }
  metrics.Disk = DiskInfo{Usage: diskUsageByMount, IO: diskIO}

  nicCounters, err := gnet.IOCounters(true)
  if err != nil {
    return metrics, err
  }
  // An empty whitelist means "collect every interface".
  selectedInterfaces := map[string]bool{}
  if len(collector.netIfaces) > 0 {
    for _, iface := range collector.netIfaces {
      selectedInterfaces[iface] = true
    }
  }

  currentNetCounters := map[string]NetCounter{}
  for _, nicCounter := range nicCounters {
    if len(selectedInterfaces) > 0 && !selectedInterfaces[nicCounter.Name] {
      continue
    }
    currentNetCounters[nicCounter.Name] = NetCounter{
      BytesSent: nicCounter.BytesSent,
      BytesRecv: nicCounter.BytesRecv,
    }
  }
  netRates := map[string]NetRate{}
  now := time.Now()
  if collector.previousNetSnapshot != nil {
    elapsedSeconds := now.Sub(collector.previousNetSnapshot.CapturedAt).Seconds()
    // Clamp the interval to avoid dividing by (near) zero.
    if elapsedSeconds < 1e-6 {
      elapsedSeconds = 1e-6
    }
    // NIC rate = delta of cumulative counters divided by the sampling gap.
    // NOTE(review): counter wrap/reset would underflow the uint64 subtraction
    // and produce a huge rate — confirm whether that matters in practice.
    for iface, currentCounter := range currentNetCounters {
      previousCounter, ok := collector.previousNetSnapshot.CountersByNIC[iface]
      if !ok {
        continue
      }
      rxBps := float64(currentCounter.BytesRecv-previousCounter.BytesRecv) / elapsedSeconds
      txBps := float64(currentCounter.BytesSent-previousCounter.BytesSent) / elapsedSeconds
      netRates[iface] = NetRate{RxBps: rxBps, TxBps: txBps, Dt: elapsedSeconds}
    }
  }
  collector.previousNetSnapshot = &NetSnapshot{CapturedAt: now, CountersByNIC: currentNetCounters}
  metrics.Net = NetInfo{Counters: currentNetCounters, Rate: netRates}

  return metrics, nil
}
​
// topNProcesses returns the processes with the highest CPU percent and the
// highest RSS as two independently sorted lists under the keys "cpu" and
// "mem". Processes whose name cannot be read or appears in ignoreNames are
// skipped; topN <= 0 yields empty lists.
func topNProcesses(topN int, ignoreNames []string) map[string][]ProcessInfo {
  ignoredProcessNames := map[string]bool{}
  for _, processName := range ignoreNames {
    ignoredProcessNames[processName] = true
  }

  processList, err := process.Processes()
  if err != nil {
    // Degrade to empty listings rather than failing the alert path.
    return map[string][]ProcessInfo{"cpu": {}, "mem": {}}
  }

  processInfos := make([]ProcessInfo, 0, len(processList))
  for _, proc := range processList {
    processName, err := proc.Name()
    if err != nil || ignoredProcessNames[processName] {
      continue
    }
    username, _ := proc.Username()
    commandLine, _ := proc.Cmdline()
    memoryInfo, err := proc.MemoryInfo()
    if err != nil {
      continue
    }
    // NOTE(review): if MemoryInfo can return (nil, nil) on some platform,
    // memoryInfo.RSS below would panic — confirm against gopsutil docs.
    cpuPercent, _ := proc.Percent(0)
    processInfos = append(processInfos, ProcessInfo{
      PID:        proc.Pid,
      Name:       processName,
      User:       username,
      CPUPercent: cpuPercent,
      RSS:        memoryInfo.RSS,
      Cmdline:    truncate(commandLine, 500),
    })
  }

  // Sort two independent copies so the CPU ordering does not disturb the
  // memory ordering.
  processesByCPU := append([]ProcessInfo(nil), processInfos...)
  sort.Slice(processesByCPU, func(i, j int) bool { return processesByCPU[i].CPUPercent > processesByCPU[j].CPUPercent })
  processesByMemory := append([]ProcessInfo(nil), processInfos...)
  sort.Slice(processesByMemory, func(i, j int) bool { return processesByMemory[i].RSS > processesByMemory[j].RSS })

  if topN < 0 {
    topN = 0
  }
  if len(processesByCPU) > topN {
    processesByCPU = processesByCPU[:topN]
  }
  if len(processesByMemory) > topN {
    processesByMemory = processesByMemory[:topN]
  }
  return map[string][]ProcessInfo{"cpu": processesByCPU, "mem": processesByMemory}
}
​
// truncate shortens value to at most maxLength bytes without splitting a
// multi-byte UTF-8 rune: when the byte cut lands mid-rune, it backs up to
// the previous rune boundary so the result stays valid UTF-8 (process
// command lines may contain non-ASCII text). A negative maxLength is
// treated as 0 (the original would have panicked slicing with it).
func truncate(value string, maxLength int) string {
  if maxLength < 0 {
    maxLength = 0
  }
  if len(value) <= maxLength {
    return value
  }
  cut := maxLength
  // Walk back over UTF-8 continuation bytes to the start of a rune.
  for cut > 0 && !utf8.RuneStart(value[cut]) {
    cut--
  }
  return value[:cut]
}
​
// promEscape escapes backslashes, newlines, and double quotes so the value
// is legal inside a Prometheus text-format label.
func promEscape(value string) string {
  escaper := strings.NewReplacer(
    `\`, `\\`,
    "\n", `\n`,
    `"`, `\"`,
  )
  return escaper.Replace(value)
}
​
// buildPromText renders the sample and alert state in the Prometheus
// textfile exposition format. Map keys are sorted before emission so the
// output is byte-stable across runs.
func buildPromText(metrics Metrics, alerts []Alert) string {
  var promBuilder strings.Builder
  fmt.Fprintf(&promBuilder, "# HELP gowatch_cpu_percent CPU usage percent\n")
  fmt.Fprintf(&promBuilder, "# TYPE gowatch_cpu_percent gauge\n")
  fmt.Fprintf(&promBuilder, "gowatch_cpu_percent %.3f\n", metrics.CPU.Percent)

  fmt.Fprintf(&promBuilder, "# HELP gowatch_mem_percent Memory usage percent\n")
  fmt.Fprintf(&promBuilder, "# TYPE gowatch_mem_percent gauge\n")
  fmt.Fprintf(&promBuilder, "gowatch_mem_percent %.3f\n", metrics.Mem.Percent)

  fmt.Fprintf(&promBuilder, "# HELP gowatch_load1 System load1\n")
  fmt.Fprintf(&promBuilder, "# TYPE gowatch_load1 gauge\n")
  fmt.Fprintf(&promBuilder, "gowatch_load1 %.3f\n", metrics.Load.Load1)

  fmt.Fprintf(&promBuilder, "# HELP gowatch_disk_percent Disk usage percent by mount\n")
  fmt.Fprintf(&promBuilder, "# TYPE gowatch_disk_percent gauge\n")
  mountpoints := make([]string, 0, len(metrics.Disk.Usage))
  for mountpoint := range metrics.Disk.Usage {
    mountpoints = append(mountpoints, mountpoint)
  }
  sort.Strings(mountpoints)
  for _, mountpoint := range mountpoints {
    diskUsage := metrics.Disk.Usage[mountpoint]
    // Mounts that failed to probe are omitted rather than exported as zero.
    if diskUsage.Error != "" {
      continue
    }
    fmt.Fprintf(&promBuilder, "gowatch_disk_percent{mount=\"%s\"} %.3f\n", promEscape(mountpoint), diskUsage.Percent)
  }

  fmt.Fprintf(&promBuilder, "# HELP gowatch_net_rx_bytes_per_sec Network RX rate bytes/s\n")
  fmt.Fprintf(&promBuilder, "# TYPE gowatch_net_rx_bytes_per_sec gauge\n")
  fmt.Fprintf(&promBuilder, "# HELP gowatch_net_tx_bytes_per_sec Network TX rate bytes/s\n")
  fmt.Fprintf(&promBuilder, "# TYPE gowatch_net_tx_bytes_per_sec gauge\n")
  interfaceNames := make([]string, 0, len(metrics.Net.Rate))
  for interfaceName := range metrics.Net.Rate {
    interfaceNames = append(interfaceNames, interfaceName)
  }
  sort.Strings(interfaceNames)
  for _, interfaceName := range interfaceNames {
    netRate := metrics.Net.Rate[interfaceName]
    fmt.Fprintf(&promBuilder, "gowatch_net_rx_bytes_per_sec{iface=\"%s\"} %.3f\n", promEscape(interfaceName), netRate.RxBps)
    fmt.Fprintf(&promBuilder, "gowatch_net_tx_bytes_per_sec{iface=\"%s\"} %.3f\n", promEscape(interfaceName), netRate.TxBps)
  }

  // Every known alert type is always exported (0 or 1) so dashboards see
  // an explicit "not firing" instead of a missing series.
  activeAlerts := map[string]bool{}
  for _, alert := range alerts {
    activeAlerts[alert.Type] = true
  }
  knownAlertTypes := []string{"cpu_percent", "mem_percent", "disk_percent", "load1"}
  fmt.Fprintf(&promBuilder, "# HELP gowatch_alert_active Alert active (0/1) by type\n")
  fmt.Fprintf(&promBuilder, "# TYPE gowatch_alert_active gauge\n")
  for _, alertType := range knownAlertTypes {
    activeValue := 0
    if activeAlerts[alertType] {
      activeValue = 1
    }
    fmt.Fprintf(&promBuilder, "gowatch_alert_active{type=\"%s\"} %d\n", promEscape(alertType), activeValue)
  }
  return promBuilder.String()
}
​
// sampleSummary renders a compact, human-readable one-line digest of a
// sample; the root-disk figure shows "n/a" when "/" was not probed cleanly.
func sampleSummary(jobName string, metrics Metrics, hasAlert bool) string {
  rootUsage := "n/a"
  if usage, ok := metrics.Disk.Usage["/"]; ok && usage.Error == "" {
    rootUsage = fmt.Sprintf("%.1f", usage.Percent)
  }
  return fmt.Sprintf("job=%s cpu=%.1f%% mem=%.1f%% load1=%.2f disk/=%s alert=%v",
    jobName, metrics.CPU.Percent, metrics.Mem.Percent, metrics.Load.Load1, rootUsage, hasAlert)
}
​
// configValueInt resolves an int setting with precedence: CLI flag (when
// non-negative, since -1 marks "unset"), then config value (when positive),
// then the built-in default.
func configValueInt(flagValue, configValue, defaultValue int) int {
  switch {
  case flagValue >= 0:
    return flagValue
  case configValue > 0:
    return configValue
  default:
    return defaultValue
  }
}
​
// configValueString resolves a string setting with precedence: CLI flag,
// then config value, then the built-in default (empty string means "unset").
func configValueString(flagValue, configValue, defaultValue string) string {
  switch {
  case flagValue != "":
    return flagValue
  case configValue != "":
    return configValue
  default:
    return defaultValue
  }
}
​
// run is the main loop: take a sample, evaluate alerts, log, optionally
// export a Prometheus textfile, then either exit (--once, --duration,
// signal) or sleep out the rest of the interval. The return value is the
// process exit code (ExitOK / ExitAlert / ExitFail / ExitLock).
func run(ctx context.Context, config Config, once bool, intervalSec, durationSec int, format, promTextfilePath string, debug bool) int {
  jobName := config.JobName
  if jobName == "" {
    jobName = "gowatch"
  }
  intervalSec = maxInt(intervalSec, 1)
  if config.TopN <= 0 {
    config.TopN = 5
  }

  logger, err := newLogger(config.LogFile, format, debug || strings.EqualFold(config.LogLevel, "DEBUG"))
  if err != nil {
    fmt.Fprintf(os.Stderr, "init logger failed: %v\n", err)
    return ExitFail
  }

  lockPath := config.LockFile
  if lockPath == "" {
    lockPath = fmt.Sprintf("/tmp/gowatch_%s.lock", jobName)
  }
  fileLock := &FileLock{path: lockPath}
  if err := fileLock.Acquire(); err != nil {
    logger.Warn("lock not acquired, another instance may be running", map[string]any{"job": jobName, "lock": lockPath, "err": err.Error()})
    return ExitLock
  }
  defer fileLock.Release()

  // Prime gopsutil's delta-based percent counters so the first sample is sane.
  warmUpCPUAndProcesses()

  collector := newCollector(config.Mountpoints, config.NetIfaces)
  engine := newAlertEngine(parseThresholds(config))
  startedAt := time.Now()
  // anyAlert remembers whether any iteration alerted, to pick the exit code.
  anyAlert := false

  for {
    loopStartedAt := time.Now()
    metrics, err := collector.Collect()
    if err != nil {
      logger.Error("collect failed", map[string]any{"job": jobName, "err": err.Error()})
      return ExitFail
    }

    hasAlert, alerts := engine.Evaluate(metrics)
    anyAlert = anyAlert || hasAlert

    if format == "json" {
      logger.Info("sample", map[string]any{"job": jobName, "metrics": metrics, "alert": hasAlert, "alerts": alerts})
    } else {
      logger.Info(sampleSummary(jobName, metrics, hasAlert), map[string]any{"job": jobName})
    }

    // Only gather the (relatively expensive) top-N process listing when an
    // alert actually fired.
    if hasAlert {
      topProcesses := topNProcesses(config.TopN, config.IgnoreProcess)
      if format == "json" {
        logger.Warn("alert_detail", map[string]any{"job": jobName, "alerts": alerts, "topn": topProcesses})
      } else {
        logger.Warn("ALERT", map[string]any{"job": jobName, "alerts": alerts})
        logger.Warn("TopCPU", map[string]any{"job": jobName, "procs": topProcesses["cpu"]})
        logger.Warn("TopMEM", map[string]any{"job": jobName, "procs": topProcesses["mem"]})
      }
    }

    if promTextfilePath != "" {
      if err := atomicWriteText(promTextfilePath, buildPromText(metrics, alerts)); err != nil {
        logger.Error("prom textfile write failed", map[string]any{"job": jobName, "err": err.Error(), "path": promTextfilePath})
      }
    }

    if once {
      if hasAlert {
        return ExitAlert
      }
      return ExitOK
    }

    if durationSec > 0 && time.Since(startedAt) >= time.Duration(durationSec)*time.Second {
      logger.Info("duration reached, exiting", map[string]any{"job": jobName, "duration": durationSec})
      break
    }

    collectDuration := time.Since(loopStartedAt)
    if collectDuration > time.Duration(intervalSec)*time.Second {
      // Sampling took longer than the interval: skip the sleep entirely and
      // log the overrun instead.
      logger.Warn("collector_overrun", map[string]any{
        "job":          jobName,
        "spent_sec":    math.Round(collectDuration.Seconds()*1000) / 1000,
        "interval_sec": intervalSec,
      })
    } else {
      // Otherwise wait out the remainder of the sampling period while still
      // listening for the stop signal.
      select {
      case <-ctx.Done():
        logger.Info("received stop signal, exiting", map[string]any{"job": jobName})
        if anyAlert {
          return ExitAlert
        }
        return ExitOK
      case <-time.After(time.Duration(intervalSec)*time.Second - collectDuration):
      }
    }

    // Non-blocking re-check so a signal that arrived during an overrun
    // iteration (which skipped the select above) is still honored.
    select {
    case <-ctx.Done():
      logger.Info("received stop signal, exiting", map[string]any{"job": jobName})
      if anyAlert {
        return ExitAlert
      }
      return ExitOK
    default:
    }
  }

  if anyAlert {
    return ExitAlert
  }
  return ExitOK
}
​
// main parses the flags, loads the JSON config, resolves flag/config/default
// precedence, wires SIGINT/SIGTERM into a cancelable context, and exits
// with the code produced by run.
func main() {
  var (
    configPath   = flag.String("config", "", "config.json path")
    once         = flag.Bool("once", false, "collect once and exit")
    interval     = flag.Int("interval", -1, "override interval seconds")
    duration     = flag.Int("duration", -1, "run duration seconds")
    format       = flag.String("format", "", "log format: json|text")
    promTextfile = flag.String("prom-textfile", "", "prometheus textfile output path")
    debug        = flag.Bool("debug", false, "enable debug logging")
  )
  flag.Parse()

  if *configPath == "" {
    fmt.Fprintln(os.Stderr, "--config is required")
    os.Exit(ExitFail)
  }

  var config Config
  if err := readJSONFile(*configPath, &config); err != nil {
    fmt.Fprintf(os.Stderr, "read config failed: %v\n", err)
    os.Exit(ExitFail)
  }

  // Flags (when set) beat config values, which beat built-in defaults.
  finalFormat := configValueString(*format, config.Format, "json")
  finalPromPath := configValueString(*promTextfile, config.PromTextfile, "")
  finalInterval := configValueInt(*interval, config.Interval, 5)
  finalDuration := *duration
  if finalDuration < 0 {
    finalDuration = 0 // 0 means "run until signaled"
  }

  ctx, cancel := context.WithCancel(context.Background())
  defer cancel()

  // Translate the first termination signal into a context cancellation that
  // run's select loops observe.
  signalCh := make(chan os.Signal, 1)
  signalNotify(signalCh)
  go func() {
    <-signalCh
    cancel()
  }()

  exitCode := run(ctx, config, *once, finalInterval, finalDuration, finalFormat, finalPromPath, *debug)
  os.Exit(exitCode)
}
​
// signalNotify registers the channel for the common termination signals
// (SIGINT and SIGTERM).
func signalNotify(signalCh chan<- os.Signal) {
  signal.Notify(signalCh, syscall.SIGINT, syscall.SIGTERM)
}

1. 这个工具能做什么

  • 周期性采集主机运行状态,包括 CPU、Load、内存、Swap、磁盘、网络、开机时间和运行时长。

  • 支持一次性采样,也支持按固定间隔持续运行。

  • 支持配置运行时长,到达指定时间后自动退出。

  • 支持按挂载点采集磁盘使用率,默认至少会关注根分区 /

  • 支持按网卡白名单采集网络流量,并计算每秒收发速率。

  • 支持基于阈值做告警判断,当前支持 cpu_percent、mem_percent、disk_percent、load1

  • 支持连续命中次数控制,避免单次抖动直接触发告警。

  • 支持按 CPU 核数计算 load1 阈值,适合多核机器。

  • 告警触发时会额外输出 CPU 和内存占用最高的进程列表,方便定位问题。

  • 支持忽略指定进程名,避免监控结果被无关进程干扰。

  • 支持 json 和 text 两种日志格式。

  • 支持同时输出到标准输出和日志文件。

  • 如果配置了日志文件,内置日志轮转,单文件达到 10 MB 后自动切分,默认保留 7 份备份。

  • 支持导出 Prometheus textfile,可直接配合 Node Exporter 的 textfile collector 使用。

  • 通过文件锁保证同一个 job 只运行一个实例,避免重复采样。

  • 支持响应 SIGINT 和 SIGTERM,适合被 systemd、crontab 或脚本调度。

2. 适合的使用场景

  • 给一台机器做轻量的本地巡检

  • 被 crontab 或 systemd 定时拉起

  • 作为批处理任务前后的健康检查

  • 把系统关键指标以 JSON 打到日志平台

  • 把告警状态和资源指标暴露给 Prometheus

3. 采集内容

  • CPU: 总使用率、每核使用率、CPU 核数

  • Load: load1、load5、load15

  • 内存: 总量、可用量、已用量、使用率

  • Swap: 总量、已用量、使用率

  • 磁盘: 每个挂载点的总量、已用量、剩余量、使用率

  • 磁盘 IO: 全局读写字节数、读写次数

  • 网络: 每块网卡的累计收发字节数、每秒收发速率

  • 主机信息: 主机名、开机时间、运行时长、采样时间

4. 告警能力

支持的阈值类型如下:

  • cpu_percent: CPU 总使用率

  • mem_percent: 内存使用率

  • disk_percent: 指定挂载点的磁盘使用率

  • load1: 1 分钟平均负载

每条阈值规则支持这些能力:

  • gt: 大于该值时触发

  • lt: 小于该值时触发

  • times: 连续命中多少次后才真正告警

  • mount: 仅用于 disk_percent,指定要检查的挂载点

  • gt_per_cpu: 仅用于 load1,按 CPU 核数 x 阈值 计算上限

告警触发后:

  • 单次采样会返回退出码 10

  • 持续运行模式下会记录告警详情

  • 会输出 Top N CPU 进程和 Top N 内存进程

5. 输出形式

5.1 文本日志

适合直接在终端查看,输出类似:

2026-03-21T12:00:00+08:00 INFO  job=gowatch cpu=13.7% mem=61.2% load1=1.25 disk/=72.4 alert=false

5.2 JSON 日志

适合被日志系统或脚本消费,采样结果和告警详情都会以 JSON 输出。

5.3 Prometheus textfile

如果指定 --prom-textfile 或配置项 prom_textfile,会输出这些指标:

  • gowatch_cpu_percent

  • gowatch_mem_percent

  • gowatch_load1

  • gowatch_disk_percent

  • gowatch_net_rx_bytes_per_sec

  • gowatch_net_tx_bytes_per_sec

  • gowatch_alert_active

6. 命令行参数

参数

说明

--config

配置文件路径,必填

--once

只采样一次后退出

--interval

覆盖配置文件中的采样间隔,单位秒

--duration

运行总时长,单位秒;到时自动退出

--format

日志格式,支持 json 和 text

--prom-textfile

Prometheus textfile 输出路径

--debug

打开调试日志

7. 配置项说明

配置项

说明

jobname

任务名称,会出现在日志和锁文件名中

interval

采样间隔,默认值为 5 秒

format

输出格式,默认值为 json

log_file

日志文件路径;配置后会同时写终端和文件

log_level

当前主要用于开启 DEBUG 级别日志

lock_file

文件锁路径;未配置时会自动落到 /tmp

mountpoints

需要采集的挂载点列表

net_ifaces

需要采集的网卡列表;为空时采集全部网卡

topn

告警时输出多少个高占用进程,默认值为 5

ignore_process

告警时需要忽略的进程名列表

thresholds

告警规则集合

prom_textfile

Prometheus textfile 输出路径

8. 退出码

退出码

含义

0

正常结束,没有告警

10

运行过程中出现过告警,或单次采样命中告警

20

采集、配置读取或初始化失败

30

没拿到文件锁,通常表示已有同名任务在运行

9. 快速开始

9.1 构建

go build -o gowatch .

config.json 示例配置

{
  "jobname": "gowatch",
  "interval": 5,
  "format": "json",
  "log_file": "/tmp/gowatch/gowatch.log",
  "log_level": "INFO",
  "lock_file": "/tmp/gowatch/gowatch.lock",
  "mountpoints": ["/"],
  "net_ifaces": ["eth0"],
  "topn": 5,
  "ignore_process": ["systemd", "kthreadd"],
  "thresholds": {
    "cpu_percent": { "gt": 85, "times": 3 },
    "mem_percent": { "gt": 80, "times": 3 },
    "disk_percent": { "gt": 80, "times": 1, "mount": "/" },
    "load1": { "gt_per_cpu": 1.5, "times": 2 }
  },
  "prom_textfile": "/tmp/gowatch/gowatch.prom"
}

9.2 运行

9.2.1 持续监控
./gowatch --config ./config.json
root@hosthatchhk:~/gowatch# ./gowatch --config ./config.json
{"alert":false,"alerts":null,"job":"gowatch","level":"INFO","metrics":{"ts":"2026-03-21T23:55:27+08:00","host":"hosthatchhk","cpu":{"count":3,"percent":50,"percpu_percent":[80.00000074505806,0,33.33333022892468]},"load":{"load1":0.14,"load5":0.14,"load15":0.19},"mem":{"total":6225928192,"available":1836888064,"used":4060340224,"percent":65.21662471496748,"swap_total":0,"swap_used":0,"swap_percent":0},"disk":{"usage":{"/":{"total":58760200192,"used":39364460544,"free":16377626624,"percent":70.61892107728262}},"io":{"read_bytes":152716232704,"write_bytes":9487571705856,"read_count":3084159,"write_count":935100404}},"net":{"counters":{"eth0":{"bytes_sent":273623595999,"bytes_recv":546038534609}},"rate":{}},"uptime_sec":8563532,"boot_time":"2025-12-12T21:09:56+08:00"},"msg":"sample","ts":"2026-03-21T23:55:27+08:00"}
{"alert":false,"alerts":null,"job":"gowatch","level":"INFO","metrics":{"ts":"2026-03-21T23:55:32+08:00","host":"hosthatchhk","cpu":{"count":3,"percent":5.827193577096099,"percpu_percent":[6.43863177954638,4.780876480349941,6.477732835501928]},"load":{"load1":0.13,"load5":0.14,"load15":0.19},"mem":{"total":6225928192,"available":1836658688,"used":4060557312,"percent":65.2201115524848,"swap_total":0,"swap_used":0,"swap_percent":0},"disk":{"usage":{"/":{"total":58760200192,"used":39364501504,"free":16377585664,"percent":70.61899455856414}},"io":{"read_bytes":152716232704,"write_bytes":9487571894272,"read_count":3084159,"write_count":935100418}},"net":{"counters":{"eth0":{"bytes_sent":273623758861,"bytes_recv":546038714259}},"rate":{"eth0":{"rx_Bps":35938.82178039044,"tx_Bps":32580.397399376274,"dt":4.998772667}}},"uptime_sec":8563537,"boot_time":"2025-12-12T21:09:56+08:00"},"msg":"sample","ts":"2026-03-21T23:55:32+08:00"}
{"alert":false,"alerts":null,"job":"gowatch","level":"INFO","metrics":{"ts":"2026-03-21T23:55:37+08:00","host":"hosthatchhk","cpu":{"count":3,"percent":7.262945515373922,"percpu_percent":[7.272727242966,7.085020236121907,6.666666650360682]},"load":{"load1":0.12,"load5":0.14,"load15":0.18},"mem":{"total":6225928192,"available":1828683776,"used":4068552704,"percent":65.34853243614153,"swap_total":0,"swap_used":0,"swap_percent":0},"disk":{"usage":{"/":{"total":58760200192,"used":39364603904,"free":16377483264,"percent":70.61917826176796}},"io":{"read_bytes":152716232704,"write_bytes":9487572082688,"read_count":3084159,"write_count":935100438}},"net":{"counters":{"eth0":{"bytes_sent":273623897867,"bytes_recv":546039385846}},"rate":{"eth0":{"rx_Bps":134264.89941679427,"tx_Bps":27790.333357153886,"dt":5.001955112}}},"uptime_sec":8563542,"boot_time":"2025-12-12T21:09:56+08:00"},"msg":"sample","ts":"2026-03-21T23:55:37+08:00"}
^C{"job":"gowatch","level":"INFO","msg":"received stop signal, exiting","ts":"2026-03-21T23:55:38+08:00"}
root@hosthatchhk:~/gowatch#
9.2.2 只采样一次
./gowatch --config ./config.json --once
root@hosthatchhk:~/gowatch# ./gowatch --config ./config.json --once
{"alert":false,"alerts":null,"job":"gowatch","level":"INFO","metrics":{"ts":"2026-03-21T23:54:56+08:00","host":"hosthatchhk","cpu":{"count":3,"percent":50.000001552204274,"percpu_percent":[80,40,0]},"load":{"load1":0.02,"load5":0.12,"load15":0.18},"mem":{"total":6225928192,"available":1822437376,"used":4074803200,"percent":65.44892704088548,"swap_total":0,"swap_used":0,"swap_percent":0},"disk":{"usage":{"/":{"total":58760200192,"used":39363870720,"free":16378216448,"percent":70.61786294682865}},"io":{"read_bytes":152716232704,"write_bytes":9487528665088,"read_count":3084159,"write_count":935096172}},"net":{"counters":{"eth0":{"bytes_sent":273622580613,"bytes_recv":546036263097}},"rate":{}},"uptime_sec":8563501,"boot_time":"2025-12-12T21:09:56+08:00"},"msg":"sample","ts":"2026-03-21T23:54:57+08:00"}
root@hosthatchhk:~/gowatch#
9.2.3 覆盖采样间隔并输出为文本日志
./gowatch --config ./config.json --interval 10 --format text
root@hosthatchhk:~/gowatch# ./gowatch --config ./config.json --interval 10 --format text
2026-03-21T23:53:58+08:00 INFO  job=gowatch cpu=50.0% mem=65.2% load1=0.07 disk/=70.6 alert=false job=gowatch
2026-03-21T23:54:08+08:00 INFO  job=gowatch cpu=6.4% mem=65.3% load1=0.06 disk/=70.6 alert=false job=gowatch
2026-03-21T23:54:18+08:00 INFO  job=gowatch cpu=6.0% mem=65.4% load1=0.05 disk/=70.6 alert=false job=gowatch
2026-03-21T23:54:28+08:00 INFO  job=gowatch cpu=11.4% mem=65.5% load1=0.04 disk/=70.6 alert=false job=gowatch
^C2026-03-21T23:54:29+08:00 INFO  received stop signal, exiting job=gowatch
root@hosthatchhk:~/gowatch#
9.2.4 运行 15 秒后退出
./gowatch --config ./config.json --format text --duration 15
root@hosthatchhk:~/gowatch# ./gowatch --config ./config.json --format text --duration 15
2026-03-21T23:57:18+08:00 INFO  job=gowatch cpu=45.5% mem=65.0% load1=0.04 disk/=70.6 alert=false job=gowatch
2026-03-21T23:57:23+08:00 INFO  job=gowatch cpu=18.3% mem=65.1% load1=0.04 disk/=70.6 alert=false job=gowatch
2026-03-21T23:57:28+08:00 INFO  job=gowatch cpu=5.9% mem=65.3% load1=0.03 disk/=70.6 alert=false job=gowatch
2026-03-21T23:57:33+08:00 INFO  job=gowatch cpu=4.9% mem=65.3% load1=0.03 disk/=70.6 alert=false job=gowatch
2026-03-21T23:57:33+08:00 INFO  duration reached, exiting duration=15 job=gowatch
root@hosthatchhk:~/gowatch#
9.2.5 导出 Prometheus textfile
./gowatch --config ./config.json --prom-textfile /var/lib/node_exporter/textfile/gowatch.prom
root@hosthatchhk:~/gowatch# ./gowatch --config ./config.json --prom-textfile /var/lib/node_exporter/textfile/gowatch.prom
{"alert":false,"alerts":null,"job":"gowatch","level":"INFO","metrics":{"ts":"2026-03-21T23:59:36+08:00","host":"hosthatchhk","cpu":{"count":3,"percent":36.363637903012474,"percpu_percent":[0,0.0000023283063801601846,100]},"load":{"load1":0.27,"load5":0.14,"load15":0.17},"mem":{"total":6225928192,"available":1828020224,"used":4069199872,"percent":65.35892715930636,"swap_total":0,"swap_used":0,"swap_percent":0},"disk":{"usage":{"/":{"total":58760200192,"used":39369326592,"free":16372760576,"percent":70.62765065352782}},"io":{"read_bytes":152716445696,"write_bytes":9487915646976,"read_count":3084169,"write_count":935135058}},"net":{"counters":{"eth0":{"bytes_sent":273631942144,"bytes_recv":546057293228}},"rate":{}},"uptime_sec":8563781,"boot_time":"2025-12-12T21:09:56+08:00"},"msg":"sample","ts":"2026-03-21T23:59:36+08:00"}
{"alert":false,"alerts":null,"job":"gowatch","level":"INFO","metrics":{"ts":"2026-03-21T23:59:41+08:00","host":"hosthatchhk","cpu":{"count":3,"percent":8.19341841829658,"percpu_percent":[7.4898785615917856,7.085020271155843,9.456740420938434]},"load":{"load1":0.25,"load5":0.14,"load15":0.17},"mem":{"total":6225928192,"available":1822982144,"used":4074246144,"percent":65.43997968423726,"swap_total":0,"swap_used":0,"swap_percent":0},"disk":{"usage":{"/":{"total":58760200192,"used":39369478144,"free":16372609024,"percent":70.62792253426946}},"io":{"read_bytes":152716445696,"write_bytes":9487916007424,"read_count":3084169,"write_count":935135086}},"net":{"counters":{"eth0":{"bytes_sent":273632105941,"bytes_recv":546057754952}},"rate":{"eth0":{"rx_Bps":92323.40111590724,"tx_Bps":32751.808726820043,"dt":5.001158909}}},"uptime_sec":8563786,"boot_time":"2025-12-12T21:09:56+08:00"},"msg":"sample","ts":"2026-03-21T23:59:41+08:00"}
{"alert":false,"alerts":null,"job":"gowatch","level":"INFO","metrics":{"ts":"2026-03-21T23:59:46+08:00","host":"hosthatchhk","cpu":{"count":3,"percent":7.4747474715515265,"percpu_percent":[8.216432838755008,7.04225349189674,7.723577255538492]},"load":{"load1":0.23,"load5":0.13,"load15":0.17},"mem":{"total":6225928192,"available":1823047680,"used":4074180608,"percent":65.43892705404335,"swap_total":0,"swap_used":0,"swap_percent":0},"disk":{"usage":{"/":{"total":58760200192,"used":39369527296,"free":16372559872,"percent":70.62801071180729}},"io":{"read_bytes":152716445696,"write_bytes":9487953575936,"read_count":3084169,"write_count":935138866}},"net":{"counters":{"eth0":{"bytes_sent":273632350259,"bytes_recv":546057999243}},"rate":{"eth0":{"rx_Bps":48858.644203249634,"tx_Bps":48864.04425234472,"dt":4.999954542}}},"uptime_sec":8563791,"boot_time":"2025-12-12T21:09:56+08:00"},"msg":"sample","ts":"2026-03-21T23:59:46+08:00"}
{"alert":false,"alerts":null,"job":"gowatch","level":"INFO","metrics":{"ts":"2026-03-21T23:59:51+08:00","host":"hosthatchhk","cpu":{"count":3,"percent":6.908115366745801,"percpu_percent":[6.2248996036321795,8.016032071234704,6.464646452825576]},"load":{"load1":0.21,"load5":0.13,"load15":0.17},"mem":{"total":6225928192,"available":1820225536,"used":4077002752,"percent":65.4842559417685,"swap_total":0,"swap_used":0,"swap_percent":0},"disk":{"usage":{"/":{"total":58760200192,"used":39369633792,"free":16372453376,"percent":70.62820176313926}},"io":{"read_bytes":152716445696,"write_bytes":9487954894848,"read_count":3084169,"write_count":935139038}},"net":{"counters":{"eth0":{"bytes_sent":273632486502,"bytes_recv":546058457714}},"rate":{"eth0":{"rx_Bps":91584.09062753986,"tx_Bps":27215.87899642052,"dt":5.006011381}}},"uptime_sec":8563796,"boot_time":"2025-12-12T21:09:56+08:00"},"msg":"sample","ts":"2026-03-21T23:59:51+08:00"}
^C{"job":"gowatch","level":"INFO","msg":"received stop signal, exiting","ts":"2026-03-21T23:59:52+08:00"}
root@hosthatchhk:~/gowatch# ls -alh /var/lib/node_exporter/textfile/gowatch.prom
-rw------- 1 root root 988 Mar 21 23:59 /var/lib/node_exporter/textfile/gowatch.prom
root@hosthatchhk:~/gowatch# cat /var/lib/node_exporter/textfile/gowatch.prom
# HELP gowatch_cpu_percent CPU usage percent
# TYPE gowatch_cpu_percent gauge
gowatch_cpu_percent 6.908
# HELP gowatch_mem_percent Memory usage percent
# TYPE gowatch_mem_percent gauge
gowatch_mem_percent 65.484
# HELP gowatch_load1 System load1
# TYPE gowatch_load1 gauge
gowatch_load1 0.210
# HELP gowatch_disk_percent Disk usage percent by mount
# TYPE gowatch_disk_percent gauge
gowatch_disk_percent{mount="/"} 70.628
# HELP gowatch_net_rx_bytes_per_sec Network RX rate bytes/s
# TYPE gowatch_net_rx_bytes_per_sec gauge
# HELP gowatch_net_tx_bytes_per_sec Network TX rate bytes/s
# TYPE gowatch_net_tx_bytes_per_sec gauge
gowatch_net_rx_bytes_per_sec{iface="eth0"} 91584.091
gowatch_net_tx_bytes_per_sec{iface="eth0"} 27215.879
# HELP gowatch_alert_active Alert active (0/1) by type
# TYPE gowatch_alert_active gauge
gowatch_alert_active{type="cpu_percent"} 0
gowatch_alert_active{type="mem_percent"} 0
gowatch_alert_active{type="disk_percent"} 0
gowatch_alert_active{type="load1"} 0
root@hosthatchhk:~/gowatch#


评论