Add CSV chunking utilities and EventBridge Lambda handlers for batch jobs (WIP)
This commit is contained in:
83
batch-jobs/app/csv_utils.py
Normal file
83
batch-jobs/app/csv_utils.py
Normal file
@@ -0,0 +1,83 @@
|
||||
import csv
|
||||
from typing import TextIO
|
||||
|
||||
from smart_open import open
|
||||
|
||||
|
||||
def byte_ranges(
    csvfile: str,
    chunk_size: int = 100,
    **kwargs,
) -> list[tuple[int, int]]:
    """Compute byte ranges for reading a CSV file in fixed-size line chunks.

    Returns pairs (start_byte, end_byte) for each fixed-size group of lines.
    Both bounds are inclusive: the next chunk's start is this chunk's
    end byte + 1.

    Parameters
    ----------
    csvfile : str
        Path (or URI, via smart_open) to the CSV file, opened in binary
        mode internally.
    chunk_size : int, optional
        Number of lines per chunk. Default is 100. Must be >= 1.
    **kwargs :
        Extra options passed to `open()`, e.g., buffering.

    Returns
    -------
    list of tuple[int, int]
        Byte ranges covering each chunk of lines. Empty for an empty file.

    Raises
    ------
    ValueError
        If `chunk_size` is less than 1.

    Example
    -------
    >>> byte_ranges("users.csv", chunk_size=500)
    [(0, 3125), (3126, 6150), (6151, 9124)]
    """
    # Guard early: chunk_size=0 would raise an opaque range() error below,
    # and a negative value would silently return [] for a non-empty file.
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be >= 1, got {chunk_size}')

    # Offset of the start of every line; the final entry is the offset
    # just past the last line (i.e., the file size).
    line_offsets = [0]
    with open(csvfile, 'rb', **kwargs) as fp:
        while fp.readline():
            line_offsets.append(fp.tell())

    total_lines = len(line_offsets) - 1
    # NOTE: named `ranges` (not `byte_ranges`) to avoid shadowing this
    # function's own name.
    ranges = []

    for start_line in range(1, total_lines + 1, chunk_size):
        # End line index for this chunk, bounded by the total line count.
        end_line = min(start_line + chunk_size - 1, total_lines)
        # Inclusive end byte: one byte before the next line's start.
        start_byte = line_offsets[start_line - 1]
        end_byte = line_offsets[end_line] - 1
        ranges.append((start_byte, end_byte))

    return ranges
|
||||
|
||||
|
||||
def detect_delimiter(sample: TextIO) -> str:
    """Detect the delimiter character used in a CSV stream.

    The stream is read in full, sniffed, and then rewound to position 0
    so the caller can re-read it from the start.

    Parameters
    ----------
    sample : TextIO
        A readable file-like object opened in text mode, positioned at 0.

    Returns
    -------
    str
        The detected delimiter character (e.g., ',', ';', '\\t').

    Raises
    ------
    csv.Error
        If the content cannot be parsed as CSV or no delimiter is found.
    ValueError
        If the stream is empty or contains no detectable delimiter.
    """
    text = sample.read()
    dialect = csv.Sniffer().sniff(text)
    # Rewind so callers can re-read the stream from the beginning.
    sample.seek(0)

    return dialect.delimiter
|
||||
0
batch-jobs/app/events/__init__.py
Normal file
0
batch-jobs/app/events/__init__.py
Normal file
20
batch-jobs/app/events/csv_chunks.py
Normal file
20
batch-jobs/app/events/csv_chunks.py
Normal file
@@ -0,0 +1,20 @@
|
||||
import boto3
|
||||
from aws_lambda_powertools.utilities.data_classes import (
|
||||
EventBridgeEvent,
|
||||
event_source,
|
||||
)
|
||||
from aws_lambda_powertools.utilities.typing import LambdaContext
|
||||
|
||||
from csv_utils import byte_ranges
|
||||
|
||||
CHUNK_SIZE = 50
|
||||
s3_client = boto3.client('s3')
|
||||
|
||||
|
||||
@event_source(data_class=EventBridgeEvent)
def lambda_handler(event: EventBridgeEvent, context: LambdaContext) -> bool:
    """Handle an EventBridge event announcing a new CSV object.

    Reads the CSV's S3 URI from the event's ``new_image`` detail and
    computes the byte ranges of its fixed-size line chunks.

    NOTE(review): ``pairs`` is computed but not yet consumed — presumably
    a later change will fan the chunks out to workers; confirm intent.

    Returns
    -------
    bool
        Always ``True``.
    """
    record = event.detail['new_image']
    pairs = byte_ranges(record['csv_s3uri'], CHUNK_SIZE)

    return True
|
||||
47
batch-jobs/app/events/read_chunk.py
Normal file
47
batch-jobs/app/events/read_chunk.py
Normal file
@@ -0,0 +1,47 @@
|
||||
import csv
|
||||
from io import StringIO
|
||||
|
||||
import boto3
|
||||
from aws_lambda_powertools.utilities.data_classes import (
|
||||
EventBridgeEvent,
|
||||
event_source,
|
||||
)
|
||||
from aws_lambda_powertools.utilities.typing import LambdaContext
|
||||
|
||||
from csv_utils import byte_ranges
|
||||
|
||||
CHUNK_SIZE = 50
|
||||
s3_client = boto3.client('s3')
|
||||
|
||||
|
||||
@event_source(data_class=EventBridgeEvent)
def lambda_handler(event: EventBridgeEvent, context: LambdaContext) -> bool:
    """Read and print the final line-chunk of the CSV named in the event.

    Pulls the CSV's S3 URI from the event's ``new_image`` detail, computes
    the chunk byte ranges, fetches only the last chunk from S3, and prints
    each parsed CSV row.

    Returns
    -------
    bool
        Always ``True``.
    """
    uri = event.detail['new_image']['csv_s3uri']
    # Keep only the last (start, end) pair; raises ValueError when the
    # file produces no chunks, matching the star-unpacking contract.
    *_, (start, stop) = byte_ranges(uri, CHUNK_SIZE)

    chunk = get_object_range(uri, start, stop, s3_client=s3_client)
    for row in csv.reader(chunk):
        print(row)
    return True
|
||||
|
||||
|
||||
def get_object_range(
    s3_uri: str,
    start_byte: int,
    end_byte: int = -1,
    *,
    s3_client,
) -> StringIO:
    """Fetch a byte range of an S3 object and return it as a text buffer.

    Parameters
    ----------
    s3_uri : str
        Object location in ``s3://bucket/key`` form.
    start_byte : int
        First byte of the range (inclusive).
    end_byte : int, optional
        Last byte of the range (inclusive). Any negative value (the
        default, -1) requests everything from ``start_byte`` to the end
        of the object.
    s3_client :
        boto3 S3 client (keyword-only) used to issue the GetObject call.

    Returns
    -------
    StringIO
        The fetched bytes decoded as UTF-8, wrapped in a text buffer.
    """
    bucket, key = s3_uri.replace('s3://', '').split('/', 1)
    # BUG FIX: the previous truthiness test (`if end_byte`) turned the
    # default of -1 into the invalid header 'bytes=<start>--1', and an
    # explicit end_byte=0 into an open-ended range. Only a non-negative
    # end byte names an upper bound.
    if end_byte >= 0:
        range_ = f'bytes={start_byte}-{end_byte}'
    else:
        range_ = f'bytes={start_byte}-'

    response = s3_client.get_object(
        Bucket=bucket,
        Key=key,
        Range=range_,
    )

    return StringIO(response['Body'].read().decode('utf-8'))
|
||||
Reference in New Issue
Block a user