@@ -163,15 +163,22 @@ def handle_document(self, document: DocumentInfo, span: Span) -> None:
163163 span .set_tag ("document.modified" , document .modified )
164164 span .set_tag ("document.action" , "crashed" )
165165
166+ # == DOCUMENT EFFECTIVE
167+
168+ # Get the document's effective date using the subclassed method
169+ # This may return None for documents without an effective date
170+ # If this fails, we can't do anything other than to skip the document
171+ effective = self .get_document_effective (document )
172+
166173 # == DOCUMENT RECORD (GET)
167174
168175 # Try to find an existing document record
169- record = self .retrieve_document (document )
176+ record = self .retrieve_document (document , effective )
170177
171178 # == DOCUMENT PROCESSING
172179
173180 # Get the created time if it is set, otherwise use the current UTC time; the modified time falls back to the created time
174- created = document .created or datetime .datetime .utcnow ( )
181+ created = document .created or datetime .datetime .now ( datetime . timezone . utc )
175182 modified = document .modified or created
176183
177184 # Check if the document has changed without downloading it and comparing hashes
@@ -193,8 +200,8 @@ def handle_document(self, document: DocumentInfo, span: Span) -> None:
193200 # If this fails, we can't do anything other than to skip the document
194201 stream , new_hash = self .download_document (document )
195202
196- # Check if the document hash has changed
197- if record and record .parsed and record .hash == new_hash :
203+ # Check if the document hash or the document URL has changed
204+ if record and record .parsed and record .hash == new_hash and record . url == document . url :
198205 changed = False
199206 else :
200207 action = "updated"
@@ -233,11 +240,6 @@ def handle_document(self, document: DocumentInfo, span: Span) -> None:
233240
234241 return
235242
236- # Get the document's effective date using the subclassed method
237- # This may return none for documents without an effective date
238- # If this fails, we can't do anything other than to skip the document
239- effective = self .get_document_effective (document )
240-
241243 if parsable :
242244 # If there is no date, we can't do anything other than to skip the document
243245 if not effective :
@@ -320,14 +322,17 @@ def handle_document(self, document: DocumentInfo, span: Span) -> None:
320322 self .logger .info ("Skipped because the %s document for %s is already stored" , document .type .value , effective )
321323 # fmt: on
322324
def retrieve_document(self, document: DocumentInfo, effective: datetime.date | None) -> Document | None:
    """Get a document record from the database. May be set by subclasses.

    A record of the same type whose URL equals ``document.url`` is preferred.
    Only when no such record exists and ``effective`` is set, a record of the
    same type with that effective date is returned instead.

    Args:
        document: Document info supplying the type and URL to look up.
        effective: Effective date of the document, or None if it has none.

    Returns:
        The matching record, or None if nothing matches.
    """

    same_type = self.session.query(Document).filter(Document.type == document.type)

    # Normally, the document URL should match. This lookup is done on its own
    # (rather than OR-ed with the date in a single unordered query) so the
    # result is predictable when one record matches by URL and a different
    # record matches by effective date.
    record = same_type.filter(Document.url == document.url).first()

    if record is None and effective:
        # If effective date is set, it may also match instead of the URL
        record = same_type.filter(Document.effective == effective).first()

    return record
331336
332337 @with_span (op = "download" )
333338 def download_document (self , document : DocumentInfo ) -> tuple [BytesIO , str ]:
0 commit comments