feat(vector): Automatic indexing of documents in s3 storage

This commit is contained in:
Björn Benouarets
2026-01-30 22:38:08 +01:00
commit e60eb297fe
15 changed files with 1026 additions and 0 deletions

41
vector/chunk.py Normal file
View File

@@ -0,0 +1,41 @@
from typing import Optional
from .client import EmbeddedClient
class Chunk:
    """A numbered piece of text that can carry an embedding vector.

    Attributes:
        id: Identifier given by the caller.
        text: The chunk's text.
        size: ``len(text)`` as measured when the chunk was created.
    """

    def __init__(self, id: int, text: str):
        """Store the identifier and text; the chunk starts without an embedding."""
        self.id = id
        self.text = text
        self.size = len(self.text)
        # Filled in by embed(); None means "not embedded yet".
        self.__embedding: Optional[list[float]] = None

    def embed(self, client: EmbeddedClient) -> list[float]:
        """Embed this chunk's text with ``client`` and remember the result.

        Args:
            client: Object whose ``embed(text)`` method produces the vector.

        Returns:
            Whatever ``client.embed(self.text)`` returned (annotated as a
            list of floats); the same object is kept on the chunk.
        """
        vector = client.embed(self.text)
        self.__embedding = vector
        return vector

    @property
    def embedding(self) -> Optional[list[float]]:
        """The stored embedding, or ``None`` if ``embed`` has not been called."""
        return self.__embedding

    @property
    def has_embedding(self) -> bool:
        """Whether an embedding is currently stored on this chunk."""
        return self.embedding is not None
class Chunks:
    """Split text into fixed-size, overlapping ``Chunk`` windows.

    Consecutive chunks start ``size - overlap`` characters apart, so each
    chunk begins with the last ``overlap`` characters of the previous one.

    Attributes:
        size: Maximum number of characters per chunk.
        overlap: Number of characters shared by neighbouring chunks.
    """

    def __init__(self, size: int = 5000, overlap: int = 200) -> None:
        """Configure the window geometry.

        Args:
            size: Maximum number of characters per chunk; must be positive.
            overlap: Characters repeated between neighbouring chunks; must
                satisfy ``0 <= overlap < size``.

        Raises:
            ValueError: If ``size`` is not positive or ``overlap`` is outside
                ``0 <= overlap < size``.
        """
        self.size = size
        self.overlap = overlap
        self.__count = 0
        # Fail fast: a non-positive step would make chunk() loop forever.
        self._step()

    def _step(self) -> int:
        """Validate the current settings and return the distance between chunk starts."""
        if self.size <= 0:
            raise ValueError(f"size must be positive, got {self.size}")
        if not 0 <= self.overlap < self.size:
            # overlap >= size would never advance through the text;
            # overlap < 0 would skip characters between chunks.
            raise ValueError(
                "overlap must satisfy 0 <= overlap < size, "
                f"got overlap={self.overlap}, size={self.size}"
            )
        return self.size - self.overlap

    def chunk(self, text: str) -> list[Chunk]:
        """Cut ``text`` into overlapping chunks numbered from 1.

        The final chunk may be shorter than ``size``. When the previous
        window already reached the end of the text, the final chunk holds
        only characters that the previous chunk also contains.

        Args:
            text: The text to split. An empty string yields an empty list.

        Returns:
            The chunks in order of appearance.

        Raises:
            ValueError: If ``size`` or ``overlap`` were changed to values
                that do not advance through the text.
        """
        # Re-validated here because size and overlap are public and mutable.
        step = self._step()
        chunks = [
            Chunk(id=number, text=text[start:start + self.size])
            for number, start in enumerate(range(0, len(text), step), start=1)
        ]
        self.__count = len(chunks)
        return chunks

    @property
    def count(self) -> int:
        """Number of chunks produced by the most recent ``chunk`` call (0 before any)."""
        return self.__count