Skip to content

Commit 3901d3c

Browse files
author
Dementii Priadko
committed
Merge branch 'feat/lock-waits-metric-and-dashboard' into 'main'
feat(monitoring): add lock_waits metric and dashboard See merge request postgres-ai/postgres_ai!82
2 parents 2473804 + e1fd417 commit 3901d3c

File tree

8 files changed

+1951
-2
lines changed

8 files changed

+1951
-2
lines changed

config/grafana/dashboards/Dashboard_13_Lock_waits.json

Lines changed: 1113 additions & 0 deletions
Large diffs are not rendered by default.

config/pgwatch-postgres/metrics.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ metrics:
1212
queryid is not null
1313
and dbid = (select oid from pg_database where datname = current_database())
1414
order by total_exec_time desc
15-
limit 1000;
1615
gauges:
1716
- '*'
1817

@@ -198,5 +197,5 @@ presets:
198197
full:
199198
description: "Full metrics for PostgreSQL storage"
200199
metrics:
201-
pgss_queryid_queries: 300
200+
pgss_queryid_queries: 30
202201
index_definitions: 3600

config/pgwatch-prometheus/metrics.yml

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -819,6 +819,71 @@ metrics:
819819
gauges:
820820
- queries
821821
statement_timeout_seconds: 15
822+
lock_waits:
823+
description: >
824+
Retrieves detailed information about lock waits, including blocked and blocking processes with their queries, users, and application names.
825+
It returns blocked and blocker process IDs, lock modes and types, affected tables, queries, and wait/transaction durations.
826+
This metric helps administrators identify and diagnose lock contention issues in detail.
827+
sqls:
828+
14: |-
829+
with sa_snapshot as ( /* pgwatch_generated */
830+
select *
831+
from pg_stat_activity
832+
where
833+
datname = current_database()
834+
and pid <> pg_backend_pid()
835+
and state in ('active', 'idle in transaction', 'idle in transaction (aborted)')
836+
),
837+
pid_tables as (
838+
select distinct on (pid) pid, relation::regclass::text as table_name
839+
from pg_catalog.pg_locks
840+
where relation is not null
841+
and locktype in ('tuple', 'relation')
842+
and relation::regclass::text not like '%_pkey'
843+
and relation::regclass::text not like '%_idx'
844+
order by pid, locktype
845+
)
846+
select
847+
blocked.pid as blocked_pid,
848+
current_database() as tag_datname,
849+
blocked_stm.usename::text as tag_blocked_user,
850+
blocked_stm.application_name::text as tag_blocked_appname,
851+
blocked.mode as blocked_mode,
852+
blocked.locktype as blocked_locktype,
853+
coalesce(blocked.relation::regclass::text, blocked_tbl.table_name, '') as tag_blocked_table,
854+
blocked_stm.query_id::text as tag_blocked_query_id,
855+
(extract(epoch from (clock_timestamp() - blocked_stm.state_change)) * 1000)::bigint as blocked_ms,
856+
blocker.pid as blocker_pid,
857+
blocker_stm.usename::text as tag_blocker_user,
858+
blocker_stm.application_name::text as tag_blocker_appname,
859+
blocker.mode as blocker_mode,
860+
blocker.locktype as blocker_locktype,
861+
coalesce(blocker.relation::regclass::text, blocker_tbl.table_name, '') as tag_blocker_table,
862+
blocker_stm.query_id::text as tag_blocker_query_id,
863+
(extract(epoch from (clock_timestamp() - blocker_stm.xact_start)) * 1000)::bigint as blocker_tx_ms
864+
from pg_catalog.pg_locks as blocked
865+
join sa_snapshot as blocked_stm on blocked_stm.pid = blocked.pid
866+
join pg_catalog.pg_locks as blocker on
867+
blocked.pid <> blocker.pid
868+
and blocker.granted
869+
and (
870+
(blocked.database = blocker.database)
871+
or (blocked.database is null and blocker.database is null)
872+
)
873+
and (
874+
blocked.relation = blocker.relation
875+
or blocked.transactionid = blocker.transactionid
876+
)
877+
join sa_snapshot as blocker_stm on blocker_stm.pid = blocker.pid
878+
left join pid_tables as blocked_tbl on blocked_tbl.pid = blocked.pid
879+
left join pid_tables as blocker_tbl on blocker_tbl.pid = blocker.pid
880+
where not blocked.granted
881+
order by blocked_ms desc
882+
limit 10000
883+
gauges:
884+
- blocked_ms
885+
- blocker_tx_ms
886+
statement_timeout_seconds: 15
822887
pg_database_wraparound:
823888
sqls:
824889
11: |
@@ -2335,6 +2400,7 @@ presets:
23352400
pg_statio_all_tables: 30
23362401
pg_statio_all_indexes: 30
23372402
pg_total_relation_size: 30
2403+
lock_waits: 30
23382404
pg_blocked: 30
23392405
pg_long_running_transactions: 30
23402406
pg_stuck_idle_in_transaction: 30

tests/lock_waits/README.md

Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
# Lock Waits Metric Testing
2+
3+
This directory contains tests and scripts to verify that the `lock_waits` metric is working correctly.
4+
5+
## Overview
6+
7+
The `lock_waits` metric collects detailed information about lock waits in PostgreSQL, including:
8+
- Waiting and blocking process IDs
9+
- User names and application names
10+
- Lock modes and types
11+
- Affected tables
12+
- Query IDs (PostgreSQL 14+)
13+
- Wait durations and blocker transaction durations
14+
15+
## Test Components
16+
17+
### 1. Python Test Script (`test_lock_waits_metric.py`)
18+
19+
Automated test that:
20+
- Creates lock contention scenarios in the target database
21+
- Waits for pgwatch to collect metrics
22+
- Verifies the metric is collected in Prometheus/VictoriaMetrics
23+
- Validates the metric structure and labels
24+
25+
### 2. SQL Script (`create_lock_contention.sql`)
26+
27+
Manual SQL script to create lock contention for testing. Can be run in multiple psql sessions.
28+
29+
## Prerequisites
30+
31+
1. Docker Compose stack running:
32+
```bash
33+
docker-compose up -d
34+
```
35+
36+
2. Python dependencies:
37+
```bash
38+
pip install psycopg requests
39+
```
40+
41+
3. Ensure `lock_waits` metric is enabled in pgwatch configuration:
42+
- Check `config/pgwatch-prometheus/metrics.yml` includes `lock_waits`
43+
- Verify pgwatch is collecting metrics from the target database
44+
45+
## Running the Automated Test
46+
47+
### Basic Usage
48+
49+
```bash
50+
# From the project root
51+
python tests/lock_waits/test_lock_waits_metric.py
52+
```
53+
54+
### With Custom Configuration
55+
56+
```bash
57+
python tests/lock_waits/test_lock_waits_metric.py \
58+
--target-db-url "postgresql://postgres:postgres@localhost:55432/target_database" \
59+
--prometheus-url "http://localhost:59090" \
60+
--test-dbname "target_database" \
61+
--collection-wait 90
62+
```
63+
64+
### Environment Variables
65+
66+
You can also set these via environment variables:
67+
68+
```bash
69+
export TARGET_DB_URL="postgresql://postgres:postgres@localhost:55432/target_database"
70+
export PROMETHEUS_URL="http://localhost:59090"
71+
export TEST_DBNAME="target_database"
72+
export COLLECTION_WAIT_SECONDS=90
73+
74+
python tests/lock_waits/test_lock_waits_metric.py
75+
```
76+
77+
## Manual Testing
78+
79+
### Step 1: Create Lock Contention
80+
81+
Open two psql sessions to the target database:
82+
83+
**Session 1 (Blocker):**
84+
```sql
85+
BEGIN;
86+
SELECT * FROM lock_test_table WHERE id = 1 FOR UPDATE;
87+
-- Keep this transaction open
88+
```
89+
90+
**Session 2 (Waiter):**
91+
```sql
92+
BEGIN;
93+
SELECT * FROM lock_test_table WHERE id = 1 FOR UPDATE;
94+
-- This will wait for Session 1 to release the lock
95+
```
96+
97+
### Step 2: Verify Metric Collection
98+
99+
Wait for pgwatch to collect metrics (check collection interval in pgwatch config, typically 15-30 seconds), then query Prometheus:
100+
101+
```bash
102+
# Query Prometheus API for lock_waits metrics
103+
curl "http://localhost:59090/api/v1/query?query=pgwatch_lock_waits_waiting_ms{datname=\"target_database\"}"
104+
105+
# Or use PromQL in Grafana Explore
106+
pgwatch_lock_waits_waiting_ms{datname="target_database"}
107+
pgwatch_lock_waits_blocker_tx_ms{datname="target_database"}
108+
```
109+
110+
### Step 3: Check Grafana Dashboard
111+
112+
1. Open Grafana: http://localhost:3000
113+
2. Navigate to "Lock waits details" dashboard
114+
3. Select the database from the dropdown
115+
4. Verify that lock wait events appear in the panels
116+
117+
## Expected Results
118+
119+
### Successful Test Output
120+
121+
```
122+
Setting up test environment...
123+
✓ Test table created
124+
125+
Creating lock contention for 30 seconds...
126+
✓ Blocker transaction started (holding lock on row id=1)
127+
✓ Waiter transaction started (waiting for lock on row id=1)
128+
Holding locks for 30 seconds...
129+
✓ Lock contention ended
130+
131+
Verifying metric collection...
132+
Waiting 60 seconds for pgwatch to collect metrics...
133+
✓ Found 5 lock_waits records
134+
135+
Validating metric structure...
136+
137+
Record 1:
138+
✓ All required data fields present
139+
✓ waiting_ms is numeric: 25000 ms
140+
✓ blocker_tx_ms is numeric: 30000 ms
141+
142+
✅ Test PASSED: lock_waits metric is working correctly
143+
```
144+
145+
## Troubleshooting
146+
147+
### No Records Found
148+
149+
- **Check pgwatch is running**: `docker ps | grep pgwatch-prometheus`
150+
- **Check pgwatch logs**: `docker logs pgwatch-prometheus`
151+
- **Verify metric is enabled**: Check `config/pgwatch-prometheus/metrics.yml`
152+
- **Check Prometheus is accessible**: `curl http://localhost:59090/api/v1/status/config`
153+
- **Increase wait time**: Use `--collection-wait 120` to wait longer
154+
- **Check database name**: Ensure `--test-dbname` matches the monitored database
155+
- **Verify metrics exist**: `curl "http://localhost:59090/api/v1/label/__name__/values" | grep lock_waits`
156+
157+
### Invalid Data Structure
158+
159+
- **Check PostgreSQL version**: Metric requires PostgreSQL 14+ for query_id support
160+
- **Verify metric SQL**: Check the SQL query in `metrics.yml` is correct
161+
- **Check pgwatch version**: Ensure pgwatch version supports the metric format
162+
- **Check Prometheus labels**: Verify metrics have expected labels (datname, waiting_pid, blocker_pid, etc.)
163+
164+
### Connection Errors
165+
166+
- **Verify Docker containers**: `docker-compose ps`
167+
- **Check connection strings**: Verify URLs match your docker-compose configuration
168+
- **Check Prometheus URL**: Ensure Prometheus/VictoriaMetrics is accessible at the specified URL
169+
- **Check network**: Ensure containers can communicate (same Docker network)
170+
171+
## Integration with CI/CD
172+
173+
The test can be integrated into CI/CD pipelines:
174+
175+
```yaml
176+
# Example GitLab CI
177+
test_lock_waits:
178+
stage: test
179+
script:
180+
- docker-compose up -d
181+
- sleep 30 # Wait for services to start
182+
- pip install psycopg
183+
- python tests/lock_waits/test_lock_waits_metric.py
184+
--target-db-url "$TARGET_DB_URL"
185+
--sink-db-url "$SINK_DB_URL"
186+
--collection-wait 90
187+
only:
188+
- merge_requests
189+
- main
190+
```
191+
192+
## Additional Test Scenarios
193+
194+
### Test Different Lock Types
195+
196+
Modify the test to create different types of locks:
197+
198+
```sql
199+
-- Table-level lock
200+
LOCK TABLE lock_test_table IN EXCLUSIVE MODE;
201+
202+
-- Advisory lock
203+
SELECT pg_advisory_lock(12345);
204+
```
205+
206+
### Test Multiple Concurrent Waits
207+
208+
Create multiple waiting transactions to test the LIMIT clause:
209+
210+
```sql
211+
-- Session 1: Blocker
212+
BEGIN;
213+
SELECT * FROM lock_test_table WHERE id = 1 FOR UPDATE;
214+
215+
-- Sessions 2-10: Multiple waiters
216+
-- Each in separate psql session
217+
BEGIN;
218+
SELECT * FROM lock_test_table WHERE id = 1 FOR UPDATE;
219+
```
220+
221+
## Related Files
222+
223+
- `config/pgwatch-prometheus/metrics.yml` - Metric definition
224+
- `config/grafana/dashboards/Dashboard_13_Lock_waits.json` - Grafana dashboard
225+
- `workload_examples/lock_wait_test.sql` - Basic lock test SQL
226+

tests/lock_waits/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Lock waits metric testing package
2+
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
-- SQL script to manually create lock contention for testing lock_waits metric
2+
--
3+
-- Usage:
4+
-- 1. Run this script in Session 1 (blocker)
5+
-- 2. Run the same script in Session 2 (waiter) - it will wait
6+
-- 3. Check the sink database for lock_waits records
7+
-- 4. Commit or rollback Session 1 to release the lock
8+
9+
-- Create test table if it doesn't exist
10+
drop table if exists lock_test_table cascade;
11+
create table lock_test_table (
12+
id int8 generated always as identity primary key,
13+
name text not null,
14+
value numeric(10, 2),
15+
created_at timestamptz default now()
16+
);
17+
18+
insert into lock_test_table (name, value)
19+
values
20+
('Item 1', 100.50),
21+
('Item 2', 200.75),
22+
('Item 3', 300.25);
23+
24+
-- ============================================
25+
-- SESSION 1 (BLOCKER) - Run this first
26+
-- ============================================
27+
begin;
28+
29+
-- Acquire exclusive lock on row id=1
30+
-- Keep this transaction open to hold the lock
31+
select * from lock_test_table where id = 1 for update;
32+
33+
-- Transaction is now holding the lock
34+
-- DO NOT COMMIT YET - keep this session open
35+
36+
-- ============================================
37+
-- SESSION 2 (WAITER) - Run this in another psql session
38+
-- ============================================
39+
begin;
40+
41+
-- This will wait for Session 1 to release the lock
42+
select * from lock_test_table where id = 1 for update;
43+
44+
-- This query will block until Session 1 commits or rolls back
45+
-- You should see it waiting in pg_stat_activity
46+
47+
-- ============================================
48+
-- To release the lock, commit or rollback Session 1:
49+
-- ============================================
50+
-- commit; -- or rollback;
51+
52+
-- ============================================
53+
-- Alternative: Test with different lock types
54+
-- ============================================
55+
56+
-- Test with table-level lock
57+
-- SESSION 1:
58+
-- begin;
59+
-- lock table lock_test_table in exclusive mode;
60+
61+
-- SESSION 2:
62+
-- begin;
63+
-- select * from lock_test_table; -- Will wait
64+
65+
-- Test with advisory lock
66+
-- SESSION 1:
67+
-- begin;
68+
-- select pg_advisory_lock(12345);
69+
70+
-- SESSION 2:
71+
-- begin;
72+
-- select pg_advisory_lock(12345); -- Will wait
73+

0 commit comments

Comments
 (0)