Add CSV chunking utilities and EventBridge Lambda handlers for batch jobs (WIP)
This commit is contained in:
83
batch-jobs/app/csv_utils.py
Normal file
83
batch-jobs/app/csv_utils.py
Normal file
@@ -0,0 +1,83 @@
|
||||
import csv
|
||||
from typing import TextIO
|
||||
|
||||
from smart_open import open
|
||||
|
||||
|
||||
def byte_ranges(
    csvfile: str,
    chunk_size: int = 100,
    **kwargs,
) -> list[tuple[int, int]]:
    """Compute byte ranges for reading a CSV file in fixed-size line chunks.

    Returns pairs (start_byte, end_byte) for each fixed-size group of lines.
    Both bounds are inclusive: the next chunk's start is this chunk's
    end byte + 1.

    Parameters
    ----------
    csvfile : str
        Path (or URI, via smart_open) to the CSV file, opened in binary
        mode internally.
    chunk_size : int, optional
        Number of lines per chunk. Default is 100. Must be >= 1.
    **kwargs :
        Extra options passed to `open()`, e.g., buffering.

    Returns
    -------
    list of tuple[int, int]
        Byte ranges covering each chunk of lines. Empty for an empty file.

    Raises
    ------
    ValueError
        If `chunk_size` is less than 1.

    Example
    -------
    >>> byte_ranges("users.csv", chunk_size=500)
    [(0, 3125), (3126, 6150), (6151, 9124)]
    """
    # Guard early: chunk_size=0 would raise an opaque range() error below,
    # and a negative value would silently return [] for a non-empty file.
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be >= 1, got {chunk_size}')

    # Offset of the start of every line; the final entry is the offset
    # just past the last line (i.e., the file size).
    line_offsets = [0]
    with open(csvfile, 'rb', **kwargs) as fp:
        while fp.readline():
            line_offsets.append(fp.tell())

    total_lines = len(line_offsets) - 1
    # NOTE: named `ranges` (not `byte_ranges`) to avoid shadowing this
    # function's own name.
    ranges = []

    for start_line in range(1, total_lines + 1, chunk_size):
        # End line index for this chunk, bounded by the total line count.
        end_line = min(start_line + chunk_size - 1, total_lines)
        # Inclusive end byte: one byte before the next line's start.
        start_byte = line_offsets[start_line - 1]
        end_byte = line_offsets[end_line] - 1
        ranges.append((start_byte, end_byte))

    return ranges
|
||||
|
||||
|
||||
def detect_delimiter(sample: TextIO) -> str:
    """Detect the delimiter character used in a CSV stream.

    The stream is read in full, sniffed, and then rewound to position 0
    so the caller can re-read it from the start.

    Parameters
    ----------
    sample : TextIO
        A readable file-like object opened in text mode, positioned at 0.

    Returns
    -------
    str
        The detected delimiter character (e.g., ',', ';', '\\t').

    Raises
    ------
    csv.Error
        If the content cannot be parsed as CSV or no delimiter is found.
    ValueError
        If the stream is empty or contains no detectable delimiter.
    """
    text = sample.read()
    dialect = csv.Sniffer().sniff(text)
    # Rewind so callers can re-read the stream from the beginning.
    sample.seek(0)

    return dialect.delimiter
|
||||
0
batch-jobs/app/events/__init__.py
Normal file
0
batch-jobs/app/events/__init__.py
Normal file
20
batch-jobs/app/events/csv_chunks.py
Normal file
20
batch-jobs/app/events/csv_chunks.py
Normal file
@@ -0,0 +1,20 @@
|
||||
import boto3
|
||||
from aws_lambda_powertools.utilities.data_classes import (
|
||||
EventBridgeEvent,
|
||||
event_source,
|
||||
)
|
||||
from aws_lambda_powertools.utilities.typing import LambdaContext
|
||||
|
||||
from csv_utils import byte_ranges
|
||||
|
||||
CHUNK_SIZE = 50
|
||||
s3_client = boto3.client('s3')
|
||||
|
||||
|
||||
@event_source(data_class=EventBridgeEvent)
def lambda_handler(event: EventBridgeEvent, context: LambdaContext) -> bool:
    """Handle an EventBridge event announcing a new CSV object.

    Reads the CSV's S3 URI from the event's ``new_image`` detail and
    computes the byte ranges of its fixed-size line chunks.

    NOTE(review): ``pairs`` is computed but not yet consumed — presumably
    a later change will fan the chunks out to workers; confirm intent.

    Returns
    -------
    bool
        Always ``True``.
    """
    record = event.detail['new_image']
    pairs = byte_ranges(record['csv_s3uri'], CHUNK_SIZE)

    return True
|
||||
47
batch-jobs/app/events/read_chunk.py
Normal file
47
batch-jobs/app/events/read_chunk.py
Normal file
@@ -0,0 +1,47 @@
|
||||
import csv
|
||||
from io import StringIO
|
||||
|
||||
import boto3
|
||||
from aws_lambda_powertools.utilities.data_classes import (
|
||||
EventBridgeEvent,
|
||||
event_source,
|
||||
)
|
||||
from aws_lambda_powertools.utilities.typing import LambdaContext
|
||||
|
||||
from csv_utils import byte_ranges
|
||||
|
||||
CHUNK_SIZE = 50
|
||||
s3_client = boto3.client('s3')
|
||||
|
||||
|
||||
@event_source(data_class=EventBridgeEvent)
def lambda_handler(event: EventBridgeEvent, context: LambdaContext) -> bool:
    """Read and print the final line-chunk of the CSV named in the event.

    Pulls the CSV's S3 URI from the event's ``new_image`` detail, computes
    the chunk byte ranges, fetches only the last chunk from S3, and prints
    each parsed CSV row.

    Returns
    -------
    bool
        Always ``True``.
    """
    uri = event.detail['new_image']['csv_s3uri']
    # Keep only the last (start, end) pair; raises ValueError when the
    # file produces no chunks, matching the star-unpacking contract.
    *_, (start, stop) = byte_ranges(uri, CHUNK_SIZE)

    chunk = get_object_range(uri, start, stop, s3_client=s3_client)
    for row in csv.reader(chunk):
        print(row)
    return True
|
||||
|
||||
|
||||
def get_object_range(
    s3_uri: str,
    start_byte: int,
    end_byte: int = -1,
    *,
    s3_client,
) -> StringIO:
    """Fetch a byte range of an S3 object and return it as a text buffer.

    Parameters
    ----------
    s3_uri : str
        Object location in ``s3://bucket/key`` form.
    start_byte : int
        First byte of the range (inclusive).
    end_byte : int, optional
        Last byte of the range (inclusive). Any negative value (the
        default, -1) requests everything from ``start_byte`` to the end
        of the object.
    s3_client :
        boto3 S3 client (keyword-only) used to issue the GetObject call.

    Returns
    -------
    StringIO
        The fetched bytes decoded as UTF-8, wrapped in a text buffer.
    """
    bucket, key = s3_uri.replace('s3://', '').split('/', 1)
    # BUG FIX: the previous truthiness test (`if end_byte`) turned the
    # default of -1 into the invalid header 'bytes=<start>--1', and an
    # explicit end_byte=0 into an open-ended range. Only a non-negative
    # end byte names an upper bound.
    if end_byte >= 0:
        range_ = f'bytes={start_byte}-{end_byte}'
    else:
        range_ = f'bytes={start_byte}-'

    response = s3_client.get_object(
        Bucket=bucket,
        Key=key,
        Range=range_,
    )

    return StringIO(response['Body'].read().decode('utf-8'))
|
||||
Reference in New Issue
Block a user