Analyzing Replica Inconsistency in ClickHouse

Ordiy Lv5

环境信息

1
2
clickhouse version: 24.10.x 
clickhouse 集群: 2 shard 2 replicate

问题描述

对历史数据执行统计SQL时,发现同一条SQL多次执行的结果不一致(不幂等)。

1
2
3
4
5
6
7
-- Locate the problem.
-- Normal case: the number of rows returned for one unique ID equals the replica count.
SELECT
    hostName() AS _name,
    unique_id
FROM clusterAllReplicas('my_ch_cluster_name', dev.table_local)
WHERE unique_id = 'Lhq-xxxx';

排查

分片2的2个副本的数据不一致,说明副本间数据同步有问题。一个副本有数据,另一个则没有,可能是其中一个副本节点在 fetch part 环节出现问题了(ClickHouse 副本同步提供的是最终一致性 Eventual consistency)

1
2
3
4
5
6
数据写入 --> part Entity
|
注册到 ZooKeeper/ClickHouse Keeper
|
其它副本节点拉取

查看replicate zookeeper path:

1
2
3
4
5
-- Compare the ZooKeeper path registered by every replica of the table.
SELECT
    hostName() AS _host,
    `table`,
    zookeeper_path
FROM clusterAllReplicas(oci_ck_cluster, system.replicas)
WHERE `table` = 'launcher_events_local';

2个副本的zk path不一致: /clickhouse/tables/51724fc0-b8bf-4051-a533-3ad15ba10e23/02 与 /clickhouse/tables/7c6def4c-7868-4b03-b64a-579641798663/02

解决问题

方案A-修复副本zk path

需要在暂停该表的所有数据写入的情况下操作

准备工作

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
-- Back up the data.
-- Optimization goal: scan the table once and use aggregation to filter unique unique_id values.
-- Plan: fix the zk path on node05; first back up the rows that exist on this node
-- but are missing on the other replica.

-- Step 1: measure the size of the unsynchronized data.
WITH raw_data AS (
    -- One row per (replica, unique_id) pair, gathered from all replicas.
    SELECT hostName() AS _name, unique_id
    FROM clusterAllReplicas(oci_ck_cluster, common_shared_db_ods.launcher_events_local)
),
unique_once AS (
    -- unique_id values that appear on exactly one replica, i.e. not replicated.
    SELECT unique_id
    FROM raw_data
    GROUP BY unique_id
    HAVING count(*) = 1
)
SELECT
    _name,
    count(*) AS line_nums
FROM raw_data
WHERE unique_id IN (SELECT unique_id FROM unique_once)
GROUP BY _name;

-- Back up the data into a temporary table.
-- FIX: the original DDL used ENGINE = MergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}'),
-- which is invalid — replication path arguments are only accepted by Replicated* engines.
-- Use ReplicatedMergeTree, and name the zk path with database + table (per the
-- conclusion of this article) instead of {uuid}, so repeated DDL runs map to one path.
CREATE TABLE IF NOT EXISTS common_shared_db_ods.launcher_events_local_node05_tmp
(
    `id` UInt64,
    `unique_id` String DEFAULT '' COMMENT '事件唯一标识符',
    `url` String DEFAULT '' COMMENT 'URL',
    `event` String DEFAULT '' COMMENT '事件名称',
    `ua` String DEFAULT '' COMMENT 'user agent',
    `ip` String DEFAULT '' COMMENT 'IP',
    `ts` UInt64 DEFAULT toUnixTimestamp(now()) COMMENT '事件发生的Unix时间戳(秒)',
    `properties` String DEFAULT '' COMMENT 'JSON格式字符串的事件属性和元数据' CODEC(LZ4),
    `is_reported` UInt8 DEFAULT 0 COMMENT '是否已上报到第三方平台(0=未上报,1=已上报)',
    `report_ts` UInt64 DEFAULT 0 COMMENT '上报到第三方平台的时间戳(秒)',
    `report_response` String DEFAULT '' COMMENT '第三方平台上报的响应内容' CODEC(LZ4),
    `channel_id` UInt8 DEFAULT 0 COMMENT '渠道ID,标识流量来源',
    `clickid` String DEFAULT '' COMMENT '点击ID,用于跟踪广告点击转化',
    `device_id` String DEFAULT '' COMMENT '设备唯一标识符',
    `created_at` DateTime64(3) DEFAULT now64(3) COMMENT '事件创建时间(精确到毫秒)',
    `device_first_seen_at` DateTime64(3) DEFAULT toDateTime64(0, 3) COMMENT '设备首次出现的时间(精确到毫秒)',
    `msg_event_time` DateTime64(3, 'UTC') DEFAULT now(),
    INDEX idx_bf_unique_id unique_id TYPE bloom_filter(0.01) GRANULARITY 1,
    INDEX idx_bf_device_id device_id TYPE bloom_filter(0.01) GRANULARITY 1,
    INDEX idx_bf_clickid clickid TYPE bloom_filter(0.01) GRANULARITY 1
)
ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/common_shared_db_ods/launcher_events_local_node05_tmp', '{replica}')
PARTITION BY toYYYYMMDD(created_at)
PRIMARY KEY (toStartOfDay(created_at), event, channel_id)
ORDER BY (toStartOfDay(created_at), event, channel_id)
SETTINGS index_granularity = 8192
COMMENT 'launcher事件表';


-- Create a plain (non-replicated) MergeTree backup table on node-05,
-- cloning the column structure of the source table.
-- NOTE: when the ENGINE clause is overridden, MergeTree requires an explicit
-- ORDER BY; the original statement also lacked a terminating semicolon.
CREATE TABLE IF NOT EXISTS common_shared_db_ods.launcher_events_local_node05_local_backup_tmp
AS common_shared_db_ods.launcher_events_local
ENGINE = MergeTree()
ORDER BY tuple();

-- Migrate the unsynchronized rows (those that exist only on node05) into
-- common_shared_db_ods.launcher_events_local_node05_local_backup_tmp.
-- FIX: ClickHouse does not accept a top-level WITH before INSERT; the WITH
-- clause must belong to the SELECT, i.e. INSERT INTO ... WITH ... SELECT.
INSERT INTO common_shared_db_ods.launcher_events_local_node05_local_backup_tmp
WITH raw_data AS (
    SELECT hostName() AS _name, unique_id
    FROM clusterAllReplicas(oci_ck_cluster, common_shared_db_ods.launcher_events_local)
),
uniq_hosts AS (
    -- unique_id values visible on exactly one replica, and that replica is node-05.
    SELECT unique_id, groupUniqArray(_name) AS hosts
    FROM raw_data
    GROUP BY unique_id
    HAVING length(hosts) = 1 AND hosts[1] = 'oci-data-clickhouse-node-05.oci-us-internal.com'
)
SELECT *
FROM common_shared_db_ods.launcher_events_local
WHERE unique_id IN (SELECT unique_id FROM uniq_hosts);



修复副本 zk path

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
-- On node-05: drop and re-create the broken replica of the table, pinning the
-- ZooKeeper path to the one used by the healthy replica
-- (/clickhouse/tables/7c6def4c-.../02) so both replicas share one replication
-- path and data can synchronize again.
CREATE TABLE common_shared_db_ods.launcher_events_local
(
`id` UInt64,
`unique_id` String DEFAULT '' COMMENT '事件唯一标识符',
`url` String DEFAULT '' COMMENT 'URL',
`event` String DEFAULT '' COMMENT '事件名称',
`ua` String DEFAULT '' COMMENT 'user agent',
`ip` String DEFAULT '' COMMENT 'IP',
`ts` UInt64 DEFAULT toUnixTimestamp(now()) COMMENT '事件发生的Unix时间戳(秒)',
`properties` String DEFAULT '' COMMENT 'JSON格式字符串的事件属性和元数据' CODEC(LZ4),
`is_reported` UInt8 DEFAULT 0 COMMENT '是否已上报到第三方平台(0=未上报,1=已上报)',
`report_ts` UInt64 DEFAULT 0 COMMENT '上报到第三方平台的时间戳(秒)',
`report_response` String DEFAULT '' COMMENT '第三方平台上报的响应内容' CODEC(LZ4),
`channel_id` UInt8 DEFAULT 0 COMMENT '渠道ID,标识流量来源',
`clickid` String DEFAULT '' COMMENT '点击ID,用于跟踪广告点击转化',
`device_id` String DEFAULT '' COMMENT '设备唯一标识符',
`created_at` DateTime64(3) DEFAULT now64(3) COMMENT '事件创建时间(精确到毫秒)',
`device_first_seen_at` DateTime64(3) DEFAULT toDateTime64(0, 3) COMMENT '设备首次出现的时间(精确到毫秒)',
`msg_event_time` DateTime64(3, 'UTC') DEFAULT now(),
INDEX idx_bf_unique_id unique_id TYPE bloom_filter(0.01) GRANULARITY 1,
INDEX idx_bf_device_id device_id TYPE bloom_filter(0.01) GRANULARITY 1,
INDEX idx_bf_clickid clickid TYPE bloom_filter(0.01) GRANULARITY 1
)
-- Explicit, fixed replication path taken from the healthy replica's metadata.
ENGINE = ReplicatedMergeTree('/clickhouse/tables/7c6def4c-7868-4b03-b64a-579641798663/02', '{replica}')
PARTITION BY toYYYYMMDD(created_at)
PRIMARY KEY (toStartOfDay(created_at), event, channel_id)
ORDER BY (toStartOfDay(created_at), event, channel_id)
SETTINGS index_granularity = 8192
COMMENT 'launcher事件表';

重建表后,数据会自动同步

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
-- Verify the replication status after the table rebuild: count, per host, the
-- rows whose unique_id is visible on only one replica.
WITH raw_data AS (
    SELECT hostName() AS _name, unique_id
    FROM clusterAllReplicas(oci_ck_cluster, common_shared_db_ods.launcher_events_local)
),
unique_once AS (
    SELECT unique_id
    FROM raw_data
    GROUP BY unique_id
    HAVING count(*) = 1
)
SELECT r._name, count(*) AS line_nums
FROM raw_data AS r
INNER JOIN unique_once AS u ON u.unique_id = r.unique_id
GROUP BY r._name;

将备份表回写

1
2
3
4
-- Execute on node-05: write the backed-up node-05-only rows back into the
-- re-created replicated table; replication then propagates them to the peer.
-- FIX: fully qualify the backup table so the statement does not depend on the
-- session's current database.
INSERT INTO common_shared_db_ods.launcher_events_local
SELECT *
FROM common_shared_db_ods.launcher_events_local_node05_local_backup_tmp;
  • 检查数据同步状态
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
-- Check the synchronization status: an empty result means data is fully in sync.
WITH raw_data AS (
    SELECT
        hostName() AS _name,
        unique_id
    FROM clusterAllReplicas(oci_ck_cluster, common_shared_db_ods.launcher_events_local)
),
unique_once AS (
    SELECT unique_id
    FROM raw_data
    GROUP BY unique_id
    HAVING count(*) = 1
)
SELECT
    _name,
    count(*) AS line_nums
FROM raw_data
WHERE unique_id IN (SELECT unique_id FROM unique_once)
GROUP BY _name;

修复后验证之前的问题

1
2
3
-- Re-run the original problem query: the unique_id should now be returned by
-- every replica of its shard.
SELECT
    hostName() AS _name,
    unique_id
FROM clusterAllReplicas(oci_ck_cluster, common_shared_db_ods.launcher_events_local)
WHERE unique_id = 'Lhq-kyL-FMp9MPQz8wd8j';

重建表

暂停表的数据写入后,开始备份数据。适用场景:表数据规模较小的情况(比如 10GB 以内)

导致问题的原因

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
-- Example DDL reproducing the root cause: ReplicatedMergeTree() with no
-- arguments expands to ReplicatedMergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}'),
-- and {uuid} differs on each execution of the CREATE statement, so replicas
-- created by separate runs end up on different zk paths (this is intentionally
-- the problematic pattern — do not copy it).
CREATE TABLE test_db.test_table_v1
(
`id` Int32,
`customer_id` Int32,
`street` String,
`city` String,
`state` String,
`zip` String,
`type` Enum8('SHIPPING' = 1, 'BILLING' = 2, 'LIVING' = 3)
)
ENGINE = ReplicatedMergeTree()
PRIMARY KEY id
ORDER BY id ;

默认的 ReplicatedMergeTree 会转换为 ReplicatedMergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}')。每次执行 DDL 时 uuid 都不一样,这导致同一张表在多个节点上多次执行建表语句后,得到的 zk path 完全不一样。

  • 建议使用 database + table_name 命名 zk path
  • 即使用 ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/{your_database_name}/{your_table_name}', '{replica}')
  • 建表时使用 CREATE TABLE IF NOT EXISTS xxx,避免重复执行建表语句时报错

参考

  • Title: Analyzing Replica Inconsistency in ClickHouse
  • Author: Ordiy
  • Created at : 2025-07-10 15:40:38
  • Updated at : 2026-03-02 04:17:17
  • Link: https://ordiy.github.io/posts/2025-06-01-Analyzing-Replica-Inconsistency-in-ClickHouse/
  • License: This work is licensed under CC BY-NC-SA 4.0.
Comments