Performance Tuning an Nginx Reverse Proxy

Nginx plays a central role in our architecture as a reverse proxy. Going from a few thousand QPS on a single machine to a cluster handling hundreds of thousands of concurrent connections, I have accumulated a fair amount of Nginx tuning experience. This post shares optimization strategies that have proven effective in production.

Basic Performance Tuning

Worker process configuration

# nginx.conf core settings
user nginx;
worker_processes auto;              # one worker per CPU core

# pin worker processes to specific CPU cores
worker_cpu_affinity auto;

events {
    # maximum connections per worker
    worker_connections 65535;
    use epoll;                      # use epoll on Linux
    multi_accept on;                # accept multiple new connections at once
}

# per-worker open file descriptor limit
worker_rlimit_nofile 100000;
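
worker_rlimit_nofile only helps if the operating system actually allows that many open files for the nginx processes. A minimal sketch of how to check and raise the OS limit, assuming a systemd-managed nginx service (paths and the PID lookup are illustrative):

# check the current limit of a running worker process
cat /proc/$(pgrep -f 'nginx: worker' | head -1)/limits | grep 'open files'

# raise the limit via a systemd drop-in, then restart nginx
mkdir -p /etc/systemd/system/nginx.service.d
cat > /etc/systemd/system/nginx.service.d/limits.conf << 'EOF'
[Service]
LimitNOFILE=100000
EOF
systemctl daemon-reload && systemctl restart nginx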

Connection handling optimization

http {
    # keep-alive settings
    keepalive_timeout 60s;          # how long idle client connections are kept open
    keepalive_requests 10000;       # maximum requests per keep-alive connection

    # client settings
    client_max_body_size 10m;       # maximum request body size
    client_body_timeout 10s;        # request body read timeout
    client_header_timeout 10s;      # request header read timeout

    # response settings
    send_timeout 10s;               # timeout for sending the response to the client
    sendfile on;                    # zero-copy file transfer
    tcp_nopush on;                  # fill packets before sending (with sendfile)
    tcp_nodelay on;                 # disable Nagle's algorithm
}
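
A quick way to confirm client keep-alive is working is to request the same URL twice in one curl invocation and look for connection reuse in the verbose output (the URL reuses the test endpoint from the benchmark script later in this post; curl's exact wording varies by version):

# the second request should report a re-used connection rather than a new connect
curl -sv -o /dev/null -o /dev/null \
    http://localhost/api/test http://localhost/api/test 2>&1 \
    | grep -iE 'connected to|re-us'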

Reverse Proxy Optimization

upstream configuration

# backend server pool
upstream backend_pool {
    # load-balancing strategy
    least_conn;                     # pick the server with the fewest active connections

    # backend servers
    server 192.168.1.10:8080 weight=3 max_fails=2 fail_timeout=10s;
    server 192.168.1.11:8080 weight=3 max_fails=2 fail_timeout=10s;
    server 192.168.1.12:8080 weight=2 max_fails=2 fail_timeout=10s backup;

    # connection pool to the upstreams
    keepalive 300;                  # keep up to 300 idle connections to upstream servers
    keepalive_requests 1000;        # maximum requests per upstream connection
    keepalive_timeout 60s;          # how long idle upstream connections are kept
}

server {
    listen 80;
    server_name api.example.com;

    location /api/ {
        proxy_pass http://backend_pool;

        # connection reuse
        proxy_http_version 1.1;
        proxy_set_header Connection "";
        proxy_connect_timeout 5s;   # upstream connect timeout
        proxy_read_timeout 30s;     # upstream read timeout
        proxy_send_timeout 30s;     # upstream send timeout

        # forwarded headers
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        # buffering
        proxy_buffering on;
        proxy_buffer_size 8k;        # buffer for the first part of the response (headers)
        proxy_buffers 8 8k;          # number and size of response buffers
        proxy_busy_buffers_size 16k; # buffers that may be busy sending to the client
    }
}
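
To verify that the upstream keepalive pool is actually being reused, count established connections from the proxy host to the backends: under steady load the count should stay roughly flat instead of growing with every request. A small sketch matching the pool above (backend IPs and port taken from the config):

# total established connections to the backend port
ss -tn state established '( dport = :8080 )' | tail -n +2 | wc -l

# break the count down per backend address
ss -tn state established '( dport = :8080 )' \
    | grep -oE '192\.168\.1\.[0-9]+:8080' | sort | uniq -c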

Case Study: API Gateway Optimization

Symptoms: during peak hours the API's P99 latency exceeded 3 seconds, and the Nginx error log filled with upstream timeout messages.

Analysis

# 1. Check Nginx status
curl http://localhost/nginx_status
# Active connections: 15000
# server accepts handled requests: 1000000 1000000 2000000
# Reading: 100 Writing: 200 Waiting: 14700

# 2. Inspect the error log
tail -f /var/log/nginx/error.log | grep timeout
# upstream timed out (110: Connection timed out) while connecting to upstream
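
To see whether the timeouts were concentrated on one backend or spread across the pool, the error log can be aggregated per upstream address. A rough sketch, assuming the standard error-log phrasing shown above, where the failing backend appears as upstream: "http://ip:port/...":

# count "upstream timed out" errors per backend address
grep 'upstream timed out' /var/log/nginx/error.log \
    | grep -oE 'upstream: "http://[0-9.]+:[0-9]+' \
    | sort | uniq -c | sort -rn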

Optimization

# tuned upstream configuration
upstream api_servers {
    # add more backend servers
    server 10.0.1.10:8080 weight=5 max_fails=3 fail_timeout=30s;
    server 10.0.1.11:8080 weight=5 max_fails=3 fail_timeout=30s;
    server 10.0.1.12:8080 weight=5 max_fails=3 fail_timeout=30s;
    server 10.0.1.13:8080 weight=3 max_fails=3 fail_timeout=30s;

    # larger connection pool
    keepalive 500;                  # more idle upstream connections
    keepalive_requests 10000;       # more requests per upstream connection
}

# tuned proxy configuration
location /api/ {
    proxy_pass http://api_servers;

    # adjusted timeouts
    proxy_connect_timeout 3s;       # fail fast on connect
    proxy_read_timeout 60s;         # allow slower responses
    proxy_send_timeout 60s;

    # enable connection reuse
    proxy_http_version 1.1;
    proxy_set_header Connection "";

    # larger buffers
    proxy_buffering on;
    proxy_buffer_size 16k;
    proxy_buffers 16 16k;
    proxy_busy_buffers_size 32k;
}

Result: P99 latency dropped to 500 ms and the error rate fell from 5% to 0.1%.

Caching Strategy Optimization

Static file caching

# static asset caching
location ~* \.(jpg|jpeg|png|gif|ico|css|js)$ {
    expires 1y;                     # cache for one year
    add_header Cache-Control "public, immutable";
    add_header Vary "Accept-Encoding";

    # compression
    gzip on;
    gzip_vary on;
    gzip_comp_level 6;
    gzip_types
        text/plain
        text/css
        text/xml
        text/javascript
        application/javascript
        application/json
        application/xml+rss;
}
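
A simple sanity check is to fetch a static asset with an Accept-Encoding header and confirm the expected caching and compression headers come back (the URL and file name below are placeholders):

# expect Cache-Control: public, immutable; an Expires a year out; Content-Encoding: gzip
curl -s -o /dev/null -D - -H 'Accept-Encoding: gzip' http://api.example.com/assets/app.js \
    | grep -iE 'cache-control|expires|content-encoding|vary'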

API response caching

# cache path and zone
proxy_cache_path /var/cache/nginx/api
    levels=1:2
    keys_zone=api_cache:100m
    max_size=10g
    inactive=60m
    use_temp_path=off;

server {
    location /api/static/ {
        proxy_pass http://backend_pool;

        # cache settings
        proxy_cache api_cache;
        proxy_cache_key "$scheme$request_method$host$request_uri";
        proxy_cache_valid 200 302 10m;  # cache successful responses for 10 minutes
        proxy_cache_valid 404 1m;       # cache 404s for 1 minute
        proxy_cache_valid any 5m;       # cache everything else for 5 minutes

        # cache behavior
        proxy_cache_use_stale error timeout invalid_header updating;
        proxy_cache_lock on;            # collapse concurrent misses for the same key
        proxy_cache_lock_timeout 3s;

        # expose cache status to clients
        add_header X-Cache-Status $upstream_cache_status;
    }
}
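
Because the config above exposes $upstream_cache_status, cache behavior is easy to verify from the command line: the first request should report MISS and an immediate repeat of the same URL should report HIT (the path below is illustrative):

# first request populates the cache, the second should be served from it
for i in 1 2; do
    curl -s -o /dev/null -D - http://api.example.com/api/static/catalog.json \
        | grep -i 'x-cache-status'
done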

Rate Limiting and Security Hardening

Request limiting configuration

http {
    # rate-limit zones
    limit_req_zone $binary_remote_addr zone=api:10m rate=10r/s;
    limit_req_zone $server_name zone=perserver:10m rate=1000r/s;

    # connection-limit zone
    limit_conn_zone $binary_remote_addr zone=addr:10m;

    server {
        # apply the limits
        limit_req zone=api burst=20 nodelay;    # per-client API limit: 10 r/s, burst of 20
        limit_req zone=perserver burst=100;     # per-server limit
        limit_conn addr 10;                     # at most 10 connections per client IP

        # request size and bandwidth limits
        client_body_timeout 10s;
        client_max_body_size 10m;
        limit_rate_after 1m;                    # start throttling after the first 1 MB
        limit_rate 500k;                        # cap download speed at 500 KB/s

        location /api/ {
            # stricter per-endpoint limit
            limit_req zone=api burst=5 nodelay;
            proxy_pass http://backend_pool;
        }
    }
}
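
The easiest way to confirm the limits are enforced is to fire a short burst of requests and tally the status codes; by default nginx rejects rate-limited requests with 503 unless limit_req_status says otherwise (the endpoint below is a placeholder):

# 30 back-to-back requests against the /api/ location (rate=10r/s, burst=5 nodelay):
# only the first handful should return 200, the rest 503
for i in $(seq 1 30); do
    curl -s -o /dev/null -w '%{http_code}\n' http://api.example.com/api/ping
done | sort | uniq -c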

Security header configuration

server {
    # security headers
    add_header X-Frame-Options "SAMEORIGIN" always;
    add_header X-Content-Type-Options "nosniff" always;
    add_header X-XSS-Protection "1; mode=block" always;
    add_header Referrer-Policy "no-referrer-when-downgrade" always;
    add_header Content-Security-Policy "default-src 'self'" always;

    # hide the nginx version
    server_tokens off;

    # reject unexpected HTTP methods
    if ($request_method !~ ^(GET|HEAD|POST)$) {
        return 405;
    }

    # block access to dotfiles
    location ~ /\. {
        deny all;
    }
}

Monitoring and Logging Optimization

Access log format

# custom log format with upstream timing
log_format main_ext '$remote_addr - $remote_user [$time_local] '
                    '"$request" $status $body_bytes_sent '
                    '"$http_referer" "$http_user_agent" '
                    '$request_time $upstream_response_time '
                    '$upstream_addr $upstream_status';

# apply the format with buffered writes
access_log /var/log/nginx/access.log main_ext buffer=64k flush=1m;

# error log
error_log /var/log/nginx/error.log warn;
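
With request and upstream timings in the access log, rough latency percentiles can be pulled straight out of it. A minimal sketch, assuming the main_ext format above and a single upstream entry per line (no retries), so that $request_time is the fourth field from the end:

# approximate P50 / P95 / P99 of $request_time over the last 100k requests
tail -n 100000 /var/log/nginx/access.log \
    | awk '{ print $(NF-3) }' \
    | sort -n \
    | awk '{ a[NR] = $1 }
           END { print "p50=" a[int(NR*0.50)], "p95=" a[int(NR*0.95)], "p99=" a[int(NR*0.99)] }'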

Performance monitoring configuration

# enable the status endpoints
server {
    listen 127.0.0.1:80;
    server_name localhost;

    location /nginx_status {
        stub_status on;
        access_log off;
        allow 127.0.0.1;
        deny all;
    }

    # detailed per-vhost metrics (requires nginx-module-vts)
    location /status {
        vhost_traffic_status_display;
        vhost_traffic_status_display_format html;
        access_log off;
    }
}
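
stub_status only exposes counters, but sampling it twice gives a quick requests-per-second figure without extra tooling. A small sketch against the local endpoint configured above (the third line of stub_status output holds the accepts/handled/requests counters):

# sample the total request counter twice, 5 seconds apart
prev=$(curl -s http://127.0.0.1/nginx_status | awk 'NR==3 {print $3}')
sleep 5
curr=$(curl -s http://127.0.0.1/nginx_status | awk 'NR==3 {print $3}')
echo "requests/s: $(( (curr - prev) / 5 ))"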

Advanced Tuning Techniques

SSL optimization

server {
    listen 443 ssl http2;           # enable HTTP/2

    # certificates
    ssl_certificate /path/to/cert.pem;
    ssl_certificate_key /path/to/key.pem;

    # TLS settings
    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers ECDHE-RSA-AES256-GCM-SHA384:DHE-RSA-AES256-GCM-SHA384;
    ssl_prefer_server_ciphers off;
    ssl_session_cache shared:SSL:10m;
    ssl_session_timeout 10m;
    ssl_session_tickets off;

    # OCSP stapling
    ssl_stapling on;
    ssl_stapling_verify on;
    resolver 8.8.8.8 8.8.4.4 valid=300s;
    resolver_timeout 5s;
}
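
After reloading, the negotiated protocol, cipher, and OCSP stapling can be checked with openssl s_client (the host name is a placeholder, and the exact output wording varies between OpenSSL versions):

# confirm the TLS version in use and that a stapled OCSP response is returned
echo | openssl s_client -connect api.example.com:443 -servername api.example.com -status 2>/dev/null \
    | grep -E 'Protocol|Cipher|OCSP Response Status'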

Dynamic upstream

# dynamic upstreams require NGINX Plus or a third-party module
upstream dynamic_backend {
    zone backend 64k;

    # pull the server list from service discovery
    # (NGINX Plus syntax; also needs a resolver and the "resolve" parameter)
    server backend1.example.com service=backend weight=5;
    server backend2.example.com service=backend weight=5;

    # active health checks (NGINX Plus; in stock Plus configs this goes in the proxied location)
    health_check interval=5s fails=3 passes=2 uri=/health;
}

Automated configuration management

#!/bin/bash
# nginx_config_update.sh - regenerate the nginx upstream config from service discovery

# fetch the list of healthy backend servers from Consul
BACKEND_SERVERS=$(curl -s http://consul:8500/v1/health/service/api-server | jq -r '.[] | select(.Checks[].Status == "passing") | .Service.Address + ":" + (.Service.Port | tostring)')

# write the new upstream block
cat > /etc/nginx/conf.d/upstream.conf << EOF
upstream backend_pool {
    least_conn;
    keepalive 300;
EOF

for server in $BACKEND_SERVERS; do
    echo "    server $server weight=1 max_fails=2 fail_timeout=10s;" >> /etc/nginx/conf.d/upstream.conf
done

echo "}" >> /etc/nginx/conf.d/upstream.conf

# test the config and reload
if nginx -t; then
    nginx -s reload
    echo "Nginx configuration updated successfully"
else
    echo "Nginx configuration test failed"
    exit 1
fi
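
One way to run this is on a short interval from cron, so the upstream membership tracks service discovery with at most about a minute of lag (the script path and schedule below are illustrative):

# crontab entry: run the updater every minute and keep a log of its output
* * * * * /usr/local/bin/nginx_config_update.sh >> /var/log/nginx_config_update.log 2>&1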

Performance Test Validation

Benchmark script

#!/bin/bash
# nginx_benchmark.sh

URL="http://localhost/api/test"
CONCURRENCY=100
REQUESTS=10000

echo "Testing Nginx performance..."

# benchmark with ab (keep-alive enabled)
ab -n $REQUESTS -c $CONCURRENCY -k $URL

# benchmark with wrk
wrk -t12 -c400 -d30s --latency $URL

# poll Nginx status (best run in a separate terminal while the benchmarks are running)
while true; do
    curl -s http://localhost/nginx_status
    sleep 5
done

With systematic tuning, an Nginx reverse proxy can comfortably handle tens of thousands of concurrent connections. The key is to adjust the configuration to your actual traffic patterns and to build solid monitoring so you can keep improving. Remember that optimization is an iterative process: test, measure, and adjust continuously.