1
1
from typing import Any , Dict , List
2
2
3
3
from confluent_kafka import Consumer , Message , TopicPartition # type: ignore
4
- from confluent_kafka .admin import AdminClient , TopicMetadata # type: ignore
4
+ from confluent_kafka .admin import TopicMetadata # type: ignore
5
5
6
6
from dlt import config , secrets
7
7
from dlt .common import pendulum
@@ -54,15 +54,17 @@ def default_msg_processor(msg: Message) -> Dict[str, Any]:
54
54
class OffsetTracker (dict ): # type: ignore
55
55
"""Object to control offsets of the given topics.
56
56
57
- Tracks all the partitions of the given topics with two params:
58
- current offset and maximum offset (partition length).
57
+ Tracks all the partitions of the given topics with three params:
58
+ current offset, maximum offset (partition length), and an end time .
59
59
60
60
Args:
61
61
consumer (confluent_kafka.Consumer): Kafka consumer.
62
62
topic_names (List): Names of topics to track.
63
63
pl_state (DictStrAny): Pipeline current state.
64
64
start_from (Optional[pendulum.DateTime]): A timestamp, after which messages
65
65
are read. Older messages are ignored.
66
+ end_time (Optional[pendulum.DateTime]): A timestamp, before which messages
67
+ are read. Newer messages are ignored.
66
68
"""
67
69
68
70
def __init__ (
@@ -71,6 +73,7 @@ def __init__(
71
73
topic_names : List [str ],
72
74
pl_state : DictStrAny ,
73
75
start_from : pendulum .DateTime = None ,
76
+ end_time : pendulum .DateTime = None ,
74
77
):
75
78
super ().__init__ ()
76
79
@@ -82,7 +85,7 @@ def __init__(
82
85
"offsets" , {t_name : {} for t_name in topic_names }
83
86
)
84
87
85
- self ._init_partition_offsets (start_from )
88
+ self ._init_partition_offsets (start_from , end_time )
86
89
87
90
def _read_topics (self , topic_names : List [str ]) -> Dict [str , TopicMetadata ]:
88
91
"""Read the given topics metadata from Kafka.
@@ -104,7 +107,9 @@ def _read_topics(self, topic_names: List[str]) -> Dict[str, TopicMetadata]:
104
107
105
108
return tracked_topics
106
109
107
- def _init_partition_offsets (self , start_from : pendulum .DateTime ) -> None :
110
+ def _init_partition_offsets (
111
+ self , start_from : pendulum .DateTime , end_time : pendulum .DateTime
112
+ ) -> None :
108
113
"""Designate current and maximum offsets for every partition.
109
114
110
115
Current offsets are read from the state, if present. Set equal
@@ -113,6 +118,8 @@ def _init_partition_offsets(self, start_from: pendulum.DateTime) -> None:
113
118
Args:
114
119
start_from (pendulum.DateTime): A timestamp, at which to start
115
120
reading. Older messages are ignored.
121
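+ end_time (pendulum.DateTime): A timestamp, before which messages
122
+ are read. Newer messages are ignored. NOTE(review): the time range is applied only when both start_from and end_time are given; otherwise the current offset comes from the stored offsets and the end is the partition's maximum offset - confirm this is intended.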
116
123
"""
117
124
all_parts = []
118
125
for t_name , topic in self ._topics .items ():
@@ -128,27 +135,38 @@ def _init_partition_offsets(self, start_from: pendulum.DateTime) -> None:
128
135
for part in topic .partitions
129
136
]
130
137
131
- # get offsets for the timestamp, if given
132
- if start_from is not None :
133
- ts_offsets = self ._consumer .offsets_for_times (parts )
138
+ # get offsets for the timestamp range; only done when both start_from and end_time are given
139
+ if start_from is not None and end_time is not None :
140
+ start_ts_offsets = self ._consumer .offsets_for_times (parts )
141
+ end_ts_offsets = self ._consumer .offsets_for_times (
142
+ [
143
+ TopicPartition (t_name , part , end_time .int_timestamp * 1000 )
144
+ for part in topic .partitions
145
+ ]
146
+ )
134
147
135
148
# designate current and maximum offsets for every partition
136
149
for i , part in enumerate (parts ):
137
150
max_offset = self ._consumer .get_watermark_offsets (part )[1 ]
138
151
139
- if start_from is not None :
140
- if ts_offsets [i ].offset != - 1 :
141
- cur_offset = ts_offsets [i ].offset
152
+ if start_from is not None and end_time is not None :
153
+ if start_ts_offsets [i ].offset != - 1 :
154
+ cur_offset = start_ts_offsets [i ].offset
142
155
else :
143
156
cur_offset = max_offset - 1
157
+ if end_ts_offsets [i ].offset != - 1 :
158
+ end_offset = end_ts_offsets [i ].offset
159
+ else :
160
+ end_offset = max_offset
144
161
else :
145
162
cur_offset = (
146
163
self ._cur_offsets [t_name ].get (str (part .partition ), - 1 ) + 1
147
164
)
165
+ end_offset = max_offset
148
166
149
167
self [t_name ][str (part .partition )] = {
150
168
"cur" : cur_offset ,
151
- "max" : max_offset ,
169
+ "max" : end_offset ,
152
170
}
153
171
154
172
parts [i ].offset = cur_offset
0 commit comments