2023-11-08

TCP连接池优化心得

TCP 连接池优化心得

在高并发系统中，TCP 连接管理直接影响系统性能。我在多个项目中都遇到过连接池相关的性能瓶颈，通过不断优化总结出一些实用的经验。

连接池参数调优

核心参数设置

// PoolConfig groups the tunable limits of a connection pool.
type PoolConfig struct {
    // MaxOpenConns is the maximum number of connections.
    MaxOpenConns int
    // MaxIdleConns is the maximum number of idle connections.
    MaxIdleConns int
    // ConnMaxLifetime is the maximum lifetime of a connection.
    ConnMaxLifetime time.Duration
    // ConnMaxIdleTime is the maximum time a connection may stay idle.
    ConnMaxIdleTime time.Duration
}

// Recommended configuration for production environments.
config := &PoolConfig{
    MaxOpenConns:    100,  // size according to the capacity of the downstream service
    MaxIdleConns:    20,   // usually set to 20-30% of MaxOpenConns
    ConnMaxLifetime: 30 * time.Minute, // avoids problems caused by long-lived connections
    ConnMaxIdleTime: 5 * time.Minute,  // releases idle connections promptly
}

数据库连接池优化

# MySQL connection pool configuration
datasource:
  hikari:
    maximum-pool-size: 50
    minimum-idle: 10
    idle-timeout: 300000 # 5 minutes
    max-lifetime: 1800000 # 30 minutes
    connection-timeout: 5000 # 5 seconds
    leak-detection-threshold: 60000 # leak detection after 1 minute

实战经验：某订单服务在高峰期出现大量超时，通过监控发现连接池耗尽。分析后发现 MaxIdleConns 设置过小（5 个）：归还的连接一旦超过空闲上限就会被关闭，下一波请求又必须重新建连，导致频繁创建/销毁连接，建连开销进一步拉长了请求等待连接的时间。调整为 20 个后，P99 延迟从 2 秒降到 200ms。

连接池监控指标

关键指标定义

// PoolStats is a snapshot of connection pool metrics.
type PoolStats struct {
    // OpenConnections is the current number of active connections.
    OpenConnections int
    // InUse is the number of connections currently being used.
    InUse int
    // Idle is the number of idle connections.
    Idle int
    // WaitCount is the number of requests waiting for a connection.
    WaitCount int
    // WaitDuration is the average time spent waiting for a connection.
    WaitDuration time.Duration
}

// GetStats returns a freshly allocated snapshot of the pool's counters.
// The average wait time is obtained from p.avgWaitTime().
func (p *Pool) GetStats() *PoolStats {
    snapshot := new(PoolStats)

    snapshot.OpenConnections = p.openConns
    snapshot.InUse = p.inUse
    snapshot.Idle = p.idle
    snapshot.WaitCount = p.waitCount
    snapshot.WaitDuration = p.avgWaitTime()

    return snapshot
}

Prometheus 指标采集

// Prometheus collectors for the connection pool. Both are partitioned by the
// "service" and "target" labels.
var (
    // poolOpenConnections is a gauge vector exported as
    // connection_pool_open_connections.
    poolOpenConnections = prometheus.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "connection_pool_open_connections",
            Help: "Number of open connections in the pool",
        },
        []string{"service", "target"},
    )

    // poolWaitDuration is a histogram vector exported as
    // connection_pool_wait_duration_seconds. No buckets are set here, so the
    // library's defaults apply.
    poolWaitDuration = prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name: "connection_pool_wait_duration_seconds",
            Help: "Time spent waiting for a connection",
        },
        []string{"service", "target"},
    )
)

连接复用优化

HTTP 连接池优化

// NewOptimizedClient returns a new *http.Client backed by its own
// *http.Transport configured for connection reuse. Every call builds a fresh
// client and transport.
func NewOptimizedClient() *http.Client {
    // Dialer used for establishing new TCP connections.
    dialer := &net.Dialer{
        Timeout:   5 * time.Second,  // connect timeout
        KeepAlive: 30 * time.Second, // keep-alive interval
    }

    transport := &http.Transport{
        DialContext:         dialer.DialContext,
        MaxIdleConns:        100,              // idle connections across all hosts
        MaxIdleConnsPerHost: 20,               // idle connections per host
        MaxConnsPerHost:     0,                // connections per host (0 = unlimited)
        IdleConnTimeout:     90 * time.Second, // how long an idle connection is kept
        TLSHandshakeTimeout: 10 * time.Second, // TLS handshake timeout
    }

    client := &http.Client{
        Timeout: 30 * time.Second, // overall request timeout
    }
    client.Transport = transport

    return client
}

gRPC 连接池

// GRPCPool holds a set of gRPC client connections that are handed out by
// GetConn.
type GRPCPool struct {
    // conns is the set of pooled connections.
    conns []*grpc.ClientConn
    // mu guards the fields of the pool.
    mu sync.RWMutex
    // target is the address the pool is associated with.
    target string
    // size is the number of connections cycled through.
    size int
    // current is the index of the next connection to hand out.
    current int
}

// GetConn returns the next connection from the pool in round-robin order.
//
// The exclusive lock is taken because advancing p.current is a write to
// shared state; holding only a read lock would let concurrent callers race
// on the cursor.
//
// The pool must be non-empty and p.size must not exceed len(p.conns);
// otherwise the index or modulo operation panics.
func (p *GRPCPool) GetConn() *grpc.ClientConn {
    p.mu.Lock()
    defer p.mu.Unlock()

    // Pick the connection under the cursor, then advance the cursor with
    // wrap-around.
    conn := p.conns[p.current]
    p.current = (p.current + 1) % p.size

    return conn
}

故障处理策略

连接失效检测

// validateConnection reports whether conn is usable: it must not be closed
// and must answer a ping within 3 seconds. A failed ping is logged.
func (p *Pool) validateConnection(conn *Connection) bool {
    // A closed connection is rejected without any network round trip.
    if conn.IsClosed() {
        return false
    }

    // Heartbeat check bounded by a timeout.
    pingCtx, release := context.WithTimeout(context.Background(), 3*time.Second)
    defer release()

    pingErr := conn.Ping(pingCtx)
    healthy := pingErr == nil
    if !healthy {
        log.Printf("Connection validation failed: %v", pingErr)
    }

    return healthy
}

连接重试机制

// GetConnectionWithRetry tries p.getConnection up to three times, waiting
// with exponential backoff (100ms, then 200ms) between attempts.
//
// It returns the first connection obtained. If every attempt fails it
// returns a nil connection and a generic error; it does not wait after the
// final attempt, since no further retry would follow.
func (p *Pool) GetConnectionWithRetry() (*Connection, error) {
    maxRetries := 3
    baseDelay := 100 * time.Millisecond

    for i := 0; i < maxRetries; i++ {
        conn, err := p.getConnection()
        if err == nil {
            return conn, nil
        }

        // Back off only when another attempt will actually be made.
        if i == maxRetries-1 {
            break
        }

        // Exponential backoff: the delay doubles on each retry.
        delay := baseDelay * time.Duration(1<<i)
        time.Sleep(delay)
    }

    return nil, errors.New("failed to get connection after retries")
}

连接池泄漏预防

自动化连接回收

// ManagedConnection wraps an embedded *sql.DB with bookkeeping about when it
// was acquired and how long it may be held.
type ManagedConnection struct {
    *sql.DB

    // pool is the pool this connection is handed back to.
    pool *Pool
    // acquired is the time the connection was acquired.
    acquired time.Time
    // timeout is the maximum time the connection may be held.
    timeout time.Duration
}

// Query runs the query on the embedded *sql.DB unless the connection has been
// held longer than mc.timeout. In that case the connection is handed back to
// the pool via mc.pool.Put and a "connection timeout" error is returned
// without running the query.
//
// NOTE(review): every call made after the timeout calls Put again on the same
// connection; confirm that Put tolerates repeated calls.
func (mc *ManagedConnection) Query(query string, args ...interface{}) (*sql.Rows, error) {
    held := time.Since(mc.acquired)
    if held <= mc.timeout {
        return mc.DB.Query(query, args...)
    }

    // Held for too long: force the connection back into the pool.
    mc.pool.Put(mc)
    return nil, errors.New("connection timeout")
}

连接泄漏监控

// startLeakDetection launches a background goroutine that runs
// p.checkConnectionLeaks once per minute.
//
// NOTE(review): the goroutine and its ticker are never stopped; there is no
// stop signal visible on Pool here, so they live for the life of the process.
func (p *Pool) startLeakDetection() {
    ticker := time.NewTicker(1 * time.Minute)

    go func() {
        for {
            // Block until the next tick, then scan for leaks.
            <-ticker.C
            p.checkConnectionLeaks()
        }
    }()
}

// checkConnectionLeaks scans p.activeConns for connections acquired more than
// five minutes ago. Each such connection is logged, and if at least one is
// found a single "connection_leak" alert carrying the count is sent.
func (p *Pool) checkConnectionLeaks() {
    const threshold = 5 * time.Minute

    var leaked int
    for _, conn := range p.activeConns {
        if time.Since(conn.acquired) <= threshold {
            continue
        }
        log.Printf("Potential connection leak: %s", conn.ID)
        leaked++
    }

    if leaked == 0 {
        return
    }

    // Raise an alert with the number of suspected leaks.
    p.alertManager.SendAlert("connection_leak", leaked)
}

通过合理配置连接池参数、建立监控告警、实现自动故障处理，可以显著提升系统的稳定性和性能。记住：连接池不是设置完就可以不管的，需要根据业务流量和下游服务的变化持续优化。