This commit is contained in:
2025-07-02 20:09:04 -03:00
parent 9b927bdbcd
commit 238c215f76
78 changed files with 10075 additions and 0 deletions

View File

@@ -0,0 +1,14 @@
import os
import boto3
def get_dynamodb_client():
    """Return a DynamoDB client appropriate for the current environment.

    When running inside AWS Lambda (detected via the standard
    AWS_LAMBDA_FUNCTION_NAME environment variable) the default endpoint
    is used; otherwise connect to a local DynamoDB on 127.0.0.1:8000.
    """
    in_lambda = os.getenv('AWS_LAMBDA_FUNCTION_NAME')
    if not in_lambda:
        # Local development: talk to DynamoDB Local.
        return boto3.client('dynamodb', endpoint_url='http://127.0.0.1:8000')
    return boto3.client('dynamodb')
# Module-level client singletons shared by the Lambda handlers in this service.
dynamodb_client = get_dynamodb_client()
s3_client = boto3.client('s3')

View File

@@ -0,0 +1,4 @@
import os
# Name of the DynamoDB user table. os.getenv may return None when the
# variable is unset; the `type: ignore` silences that mismatch.
# NOTE(review): consider failing fast with os.environ['USER_TABLE'] — TODO confirm.
USER_TABLE: str = os.getenv('USER_TABLE')  # type: ignore
# Number of CSV lines per processing chunk (passed to csv_utils.byte_ranges).
CHUNK_SIZE = 50

View File

@@ -0,0 +1,83 @@
import csv
from typing import TextIO
from smart_open import open
def byte_ranges(
    csvfile: str,
    chunk_size: int = 100,
    **kwargs,
) -> list[tuple[int, int]]:
    """Compute inclusive byte ranges covering a CSV file in line chunks.

    Parameters
    ----------
    csvfile : str
        Path (or URI — the file is opened with smart_open's ``open``)
        of the CSV file, opened in binary mode internally.
    chunk_size : int, optional
        Number of lines per chunk. Default is 100.
    **kwargs :
        Extra options passed to ``open()``, e.g. ``transport_params``.

    Returns
    -------
    list of tuple[int, int]
        ``(start_byte, end_byte)`` pairs, end inclusive, one per chunk of
        up to ``chunk_size`` lines. Empty list for an empty file.

    Example
    -------
    >>> byte_ranges("users.csv", chunk_size=500)  # doctest: +SKIP
    [(0, 3125), (3126, 6150), (6151, 9124)]
    """
    # line_offsets[i] is the byte offset at which line i+1 begins.
    line_offsets = [0]
    with open(csvfile, 'rb', **kwargs) as fp:
        while fp.readline():
            line_offsets.append(fp.tell())
    total_lines = len(line_offsets) - 1
    # Renamed from `byte_ranges`: the original local shadowed this function.
    ranges: list[tuple[int, int]] = []
    for start_line in range(1, total_lines + 1, chunk_size):
        # Last line of this chunk, bounded by the total line count.
        end_line = min(start_line + chunk_size - 1, total_lines)
        start_byte = line_offsets[start_line - 1]
        # Inclusive end: one byte before the next line begins.
        end_byte = line_offsets[end_line] - 1
        ranges.append((start_byte, end_byte))
    return ranges
def detect_delimiter(sample: TextIO) -> str:
    """Sniff and return the delimiter character of a CSV stream.

    Parameters
    ----------
    sample : TextIO
        Readable text-mode file object positioned at the start of the data.

    Returns
    -------
    str
        The detected delimiter (e.g. ``','``, ``';'``, ``'\\t'``).

    Raises
    ------
    csv.Error
        If delimiter detection fails.
    ValueError
        If the stream is empty or has no detectable delimiter.
    """
    detected = csv.Sniffer().sniff(sample.read())
    # Rewind so the caller can re-read the stream from the top.
    sample.seek(0)
    return detected.delimiter

View File

View File

@@ -0,0 +1,20 @@
from aws_lambda_powertools.utilities.data_classes import (
EventBridgeEvent,
event_source,
)
from aws_lambda_powertools.utilities.typing import LambdaContext
from boto3clients import s3_client
from config import CHUNK_SIZE
from csv_utils import byte_ranges
# smart_open transport options: reuse the shared boto3 S3 client.
transport_params = {'client': s3_client}
@event_source(data_class=EventBridgeEvent)
def lambda_handler(event: EventBridgeEvent, context: LambdaContext) -> bool:
    """Split the CSV object referenced by the event into byte-range chunks.

    NOTE(review): this handler reads the key 's3uri' while the range-reader
    handler reads 's3_uri' — confirm which spelling the event actually carries.
    """
    record = event.detail['new_image']
    source_uri = record['s3uri']
    # Result is currently unused beyond forcing the ranges to be computed.
    chunk_ranges = byte_ranges(source_uri, CHUNK_SIZE, transport_params=transport_params)
    return True

View File

@@ -0,0 +1,14 @@
from aws_lambda_powertools.utilities.data_classes import (
EventBridgeEvent,
event_source,
)
from aws_lambda_powertools.utilities.typing import LambdaContext
from boto3clients import s3_client
# smart_open transport options (shared S3 client).
# NOTE(review): not referenced by the handler below — confirm it is needed.
transport_params = {'client': s3_client}
@event_source(data_class=EventBridgeEvent)
def lambda_handler(event: EventBridgeEvent, context: LambdaContext) -> bool:
    """Placeholder EventBridge handler; currently acknowledges every event."""
    return True

View File

@@ -0,0 +1,55 @@
import csv
from io import StringIO
from typing import TYPE_CHECKING
from aws_lambda_powertools.utilities.data_classes import (
EventBridgeEvent,
event_source,
)
from aws_lambda_powertools.utilities.typing import LambdaContext
from boto3clients import s3_client
# Import the S3 client type only for static analysis; at runtime fall back
# to `object` so the annotation in _get_s3_object_range still evaluates.
if TYPE_CHECKING:
    from mypy_boto3_s3.client import S3Client
else:
    S3Client = object
# smart_open transport options (shared S3 client).
# NOTE(review): appears unused in this module — the handler calls s3_client directly.
transport_params = {'client': s3_client}
@event_source(data_class=EventBridgeEvent)
def lambda_handler(event: EventBridgeEvent, context: LambdaContext) -> bool:
    """Read one byte range of a CSV object from S3 and print its rows.

    NOTE(review): this handler reads the key 's3_uri' while the chunking
    handler writes 's3uri' — confirm which spelling the event carries.
    """
    record = event.detail['new_image']
    chunk = _get_s3_object_range(
        record['s3_uri'],
        start_byte=record['start_byte'],
        end_byte=record['end_byte'],
        s3_client=s3_client,
    )
    for row in csv.reader(chunk):
        print(row)
    return True
def _get_s3_object_range(
    s3_uri: str,
    *,
    start_byte: int,
    end_byte: int,
    s3_client: 'S3Client',
) -> StringIO:
    """Fetch an inclusive byte range of an S3 object and return it as text.

    Parameters
    ----------
    s3_uri : str
        Full object URI, e.g. ``s3://bucket/key/path.csv``.
    start_byte, end_byte : int
        Inclusive byte range to fetch (HTTP Range semantics).
    s3_client : S3Client
        boto3 S3 client used to issue the GetObject call.

    Returns
    -------
    StringIO
        The fetched bytes decoded as UTF-8, wrapped in a text buffer.
    """
    # removeprefix only strips a leading 's3://'; str.replace would also
    # mangle any later occurrence of that substring inside the key.
    bucket, key = s3_uri.removeprefix('s3://').split('/', 1)
    response = s3_client.get_object(
        Bucket=bucket,
        Key=key,
        # HTTP Range header is inclusive on both ends.
        Range=f'bytes={start_byte}-{end_byte}',
    )
    return StringIO(response['Body'].read().decode('utf-8'))

View File

@@ -0,0 +1,40 @@
import urllib.parse as urllib_parse
from email.utils import parseaddr
from aws_lambda_powertools import Logger
from aws_lambda_powertools.utilities.data_classes import SESEvent, event_source
from aws_lambda_powertools.utilities.typing import LambdaContext
from layercake.dynamodb import DynamoDBPersistenceLayer, KeyPair, SortKey
from boto3clients import dynamodb_client
from config import USER_TABLE
from ses_utils import get_header_value
# Powertools structured logger for this module.
logger = Logger(__name__)
# Persistence layer over the user table, queried by recipient email below.
user_layer = DynamoDBPersistenceLayer(USER_TABLE, dynamodb_client)
@logger.inject_lambda_context
@event_source(data_class=SESEvent)
def lambda_handler(event: SESEvent, context: LambdaContext) -> dict:
    """Route an inbound SES message to a mailbox, or stop the rule set.

    Returns ``{'disposition': 'STOP_RULE_SET'}`` when the recipient address
    has no matching record, ``{'disposition': 'CONTINUE'}`` otherwise.
    """
    ses = event.record.ses
    # Recipient addresses may arrive URL-encoded; normalize to lowercase.
    to = urllib_parse.unquote(ses.receipt.recipients[0]).lower()
    # NOTE(review): parsed sender (name, email_from) is currently unused.
    name, email_from = parseaddr(get_header_value(ses.mail.headers, 'from'))
    # Look up by recipient email. path_spec='user_id' suggests the returned
    # value is a user_id, yet the variable is named org_id — TODO confirm.
    org_id = user_layer.collection.get_item(
        KeyPair('email', SortKey(to, path_spec='user_id')),
        raise_on_error=False,
        default={},
    )
    if not org_id:
        return {'disposition': 'STOP_RULE_SET'}
    # TODO(review): debug print — consider replacing with a logger call.
    print(
        {
            'id': f'mailbox#{org_id}',
            'sk': ses.mail.message_id,
        }
    )
    return {'disposition': 'CONTINUE'}

View File

@@ -0,0 +1,20 @@
from typing import Any, Iterator
from aws_lambda_powertools.utilities.data_classes.ses_event import SESMailHeader
def get_header_value(
    headers: 'Iterator[SESMailHeader]',
    header_name: str,
    *,
    default: Any = None,
    raise_on_missing: bool = True,
) -> str:
    """Return the value of the first header matching *header_name*.

    The match is case-insensitive on BOTH sides (previously only the
    header's own name was lowercased, so a caller passing 'From' would
    never match).

    Parameters
    ----------
    headers : Iterator[SESMailHeader]
        Headers to scan, each exposing ``.name`` and ``.value``.
    header_name : str
        Header to look up, any casing.
    default : Any, keyword-only
        Value returned when the header is absent and ``raise_on_missing``
        is False.
    raise_on_missing : bool, keyword-only
        When True (the default), a missing header raises ValueError.

    Raises
    ------
    ValueError
        If the header is missing and ``raise_on_missing`` is True.
    """
    # Lowercase the target once, outside the loop.
    wanted = header_name.lower()
    for header in headers:
        if header.name.lower() == wanted:
            return header.value
    if raise_on_missing:
        raise ValueError(f'{header_name} not found.')
    return default