feat(vector): Automatic indexing of documents in s3 storage
This commit is contained in:
41
vector/chunk.py
Normal file
41
vector/chunk.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from typing import Optional
|
||||
from .client import EmbeddedClient
|
||||
|
||||
class Chunk:
    """A numbered piece of text together with its (optional) embedding.

    Attributes:
        id: Identifier given to this chunk by its creator.
        text: The chunk's text content.
        size: Length of ``text`` in characters, captured at construction.
    """

    def __init__(self, id: int, text: str):
        """Record the identifier and text; no embedding is computed yet."""
        self.id = id
        self.text = text
        self.size = len(self.text)
        # Filled in by embed(); None means "not embedded yet".
        self.__embedding: Optional[list[float]] = None

    def embed(self, client: EmbeddedClient) -> list[float]:
        """Embed this chunk's text with *client* and remember the result.

        Args:
            client: Object whose ``embed(text)`` method produces the vector.

        Returns:
            Whatever ``client.embed`` returned for ``self.text``; the same
            object is stored and later exposed through ``embedding``.
        """
        vector = client.embed(self.text)
        self.__embedding = vector
        return vector

    @property
    def embedding(self) -> Optional[list[float]]:
        """The stored embedding, or ``None`` if ``embed`` has not been called."""
        return self.__embedding

    @property
    def has_embedding(self) -> bool:
        """``True`` once a non-``None`` embedding has been stored."""
        return not (self.__embedding is None)
|
||||
|
||||
class Chunks:
    """Split text into fixed-size, overlapping character windows.

    Attributes:
        size: Maximum number of characters in one chunk.
        overlap: Number of characters shared by two consecutive chunks.
    """

    def __init__(self, size: int = 5000, overlap: int = 200) -> None:
        """Store the window configuration.

        Args:
            size: Maximum number of characters in one chunk.
            overlap: Number of characters shared by consecutive chunks.
        """
        self.size = size
        self.overlap = overlap
        # Number of chunks produced by the most recent chunk() call.
        self.__count = 0

    def chunk(self, text: str) -> list[Chunk]:
        """Cut *text* into ``Chunk`` objects numbered from 1.

        Windows start every ``size - overlap`` characters and span at most
        ``size`` characters, so the final chunk may be shorter. When the
        previous window already reached the end of the text, the final
        chunk is a suffix of that previous chunk.

        Args:
            text: The text to split. An empty string yields an empty list.

        Returns:
            The chunks in order of appearance.

        Raises:
            ValueError: If ``size`` is not positive or ``overlap`` is not
                smaller than ``size``. The window would then never advance
                and the split would not terminate.
        """
        if self.size <= 0:
            raise ValueError(f"size must be positive, got {self.size}")

        # Distance between the starts of two consecutive windows.
        step = self.size - self.overlap
        if step <= 0:
            raise ValueError(
                f"overlap ({self.overlap}) must be smaller than "
                f"size ({self.size})"
            )

        chunks = [
            Chunk(id=chunk_id, text=text[start:start + self.size])
            for chunk_id, start in enumerate(range(0, len(text), step), start=1)
        ]
        self.__count = len(chunks)
        return chunks

    @property
    def count(self) -> int:
        """Number of chunks produced by the last ``chunk`` call (0 before any)."""
        return self.__count
|
||||
Reference in New Issue
Block a user