41 lines
1.1 KiB
Python
41 lines
1.1 KiB
Python
from typing import Optional
|
|
from .client import EmbeddedClient
|
|
|
|
class Chunk:
    """A numbered piece of text that can carry an embedding vector.

    The constructor stores ``id`` and ``text`` as given and records the
    character length of ``text`` in ``size``. The embedding is kept in a
    private attribute that starts out as ``None`` and is exposed read-only
    through :attr:`embedding` and :attr:`has_embedding`.
    """

    def __init__(self, id: int, text: str):
        # Nothing is embedded at construction time; embed() fills this in.
        self.__embedding = None
        self.id = id
        self.text = text
        self.size = len(text)

    def embed(self, client: EmbeddedClient) -> list[float]:
        """Embed this chunk's text via ``client.embed``, store and return it.

        Each call asks the client again and overwrites whatever was stored
        before; no caching is performed here.
        """
        vector = client.embed(self.text)
        self.__embedding = vector
        return vector

    @property
    def embedding(self) -> Optional[list[float]]:
        """The stored embedding, or ``None`` when nothing has been stored."""
        return self.__embedding

    @property
    def has_embedding(self) -> bool:
        """``True`` when the stored embedding is not ``None``."""
        return self.__embedding is not None
|
|
|
|
class Chunks:
    """Split text into fixed-size, overlapping :class:`Chunk` objects.

    ``size`` is the maximum number of characters per chunk and ``overlap``
    is how many characters consecutive chunks share; each chunk therefore
    starts ``size - overlap`` characters after the previous one. Both are
    plain public attributes and are read again on every :meth:`chunk` call.
    """

    def __init__(self, size: int = 5000, overlap: int = 200) -> None:
        self.size = size
        self.overlap = overlap
        # Number of chunks produced by the most recent chunk() call.
        self.__count = 0

    def chunk(self, text: str) -> list[Chunk]:
        """Return the chunks of *text*, numbered from 1 in order of position.

        The final chunk may be shorter than ``size``. Because a new chunk is
        started at every stride position before the end of the text, the
        final chunk can fall entirely within the previous chunk's overlap
        region.

        Also updates :attr:`count` to the number of chunks returned.

        Raises:
            ValueError: if *text* is non-empty and ``overlap`` is not smaller
                than ``size``. The start position could then never advance,
                so the split would never finish.
        """
        if not text:
            # Nothing to split; still reset the count for this call.
            self.__count = 0
            return []

        stride = self.size - self.overlap
        if stride <= 0:
            raise ValueError(
                f"overlap ({self.overlap}) must be smaller than "
                f"size ({self.size})"
            )

        width = self.size
        starts = range(0, len(text), stride)
        chunks = [
            Chunk(id=number, text=text[start:start + width])
            for number, start in enumerate(starts, start=1)
        ]
        self.__count = len(chunks)
        return chunks

    @property
    def count(self) -> int:
        """Number of chunks produced by the most recent :meth:`chunk` call.

        Zero until :meth:`chunk` has been called.
        """
        return self.__count