xtalk.model_types

Embeddings

Defined in langchain_core.embeddings.

from langchain_core.embeddings import Embeddings

External dependency re-exported by this module.

BaseChatModel

Defined in langchain_core.language_models.chat_models.

from langchain_core.language_models.chat_models import BaseChatModel

External dependency re-exported by this module.

Agent

Defined in xtalk.llm_agent.interfaces.

class Agent(ABC)

Abstract interface for conversational agents used by Xtalk.

Methods

content_to_text

Defined in xtalk.llm_agent.interfaces.

def content_to_text(content: Any) -> str

Normalize model content blocks into plain text.

Parameters

content (Any) Content emitted by a LangChain model chunk or message.

Returns

str Plain-text content extracted from the input.

accept

Defined in xtalk.llm_agent.interfaces.

def accept(self, context: AgentContext) -> Iterable[AgentOutput]

Accept an incremental context update.

Parameters

context (AgentContext) Context payload forwarded from serving-layer events.

Yields

AgentOutput Zero or more streamed response items triggered by the context update.

async_accept

Defined in xtalk.llm_agent.interfaces.

async def async_accept(self, context: AgentContext) -> AsyncIterator[AgentOutput]

Asynchronously accept an incremental context update.

Parameters

context (AgentContext) Context payload forwarded from serving-layer events.

Yields

AgentOutput Streamed response items triggered by the context update.

sync_iter_from_async

Defined in xtalk.llm_agent.interfaces.

def sync_iter_from_async(self, async_iter: AsyncIterator[T]) -> Iterable[T]

Convert an async iterator into a synchronous generator.

Parameters

async_iter (AsyncIterator[T]) Async iterator to bridge into synchronous iteration.

Yields

T Items produced by async_iter.

clone

Defined in xtalk.llm_agent.interfaces.

def clone(self) -> 'Agent'

Clone the agent for a new session.

Returns

Agent Session-safe agent instance.

restore_history

Defined in xtalk.llm_agent.interfaces.

def restore_history(self, messages: list[dict[str, Any]]) -> None

Restore persisted conversation messages into the agent state.

Parameters

messages (list[dict[str, Any]]) Persisted chat messages ordered by session history.

get_chat_history

Defined in xtalk.llm_agent.interfaces.

def get_chat_history(self, with_system: bool = False) -> str | None

Return the serialized conversation history when available.

Parameters

with_system (bool, optional) Whether to include the system prompt message when supported by the concrete implementation.

Returns

str | None Serialized conversation history, or None when it is not available.

add_tools

Defined in xtalk.llm_agent.interfaces.

def add_tools(self, tools: list[BaseTool | Callable[[], BaseTool]]) -> None

Attach tools to the agent.

Parameters

tools (list[BaseTool | Callable[[], BaseTool]]) Tool instances or factories that produce tool instances.

Rewriter

Defined in xtalk.rewriter.interfaces.

class Rewriter(ABC)

Abstract interface for text rewriting helpers.

Methods

rewrite

Defined in xtalk.rewriter.interfaces.

def rewrite(self, input: str) -> str

Rewrite input text.

Parameters

input (str) Source text to rewrite.

Returns

str Rewritten text.

async_rewrite

Defined in xtalk.rewriter.interfaces.

async def async_rewrite(self, input: str) -> str

Asynchronously rewrite input text.

Parameters

input (str) Source text to rewrite.

Returns

str Rewritten text.

ASR

Defined in xtalk.speech.interfaces.

class ASR(ABC)

Abstract interface for automatic speech recognition.

Methods

recognize

Defined in xtalk.speech.interfaces.

def recognize(self, audio: bytes) -> str

Recognize a full audio buffer.

Parameters

audio (bytes) PCM 16-bit mono audio bytes.

Returns

str Recognized text.

recognize_stream

Defined in xtalk.speech.interfaces.

def recognize_stream(self, audio: bytes, *, is_final: bool = False, chat_history: str | None = None) -> str

Recognize audio incrementally in streaming mode.

Parameters

audio (bytes) Incremental PCM 16-bit mono audio bytes.
is_final (bool, optional) Whether the caller is asking the ASR to treat the current point as a temporary boundary and optionally flush any tail audio that would otherwise remain buffered. This is only a decoding hint. It does not mean the streaming state must be reset, and previously recognized text for the session must be preserved so later audio can continue from the accumulated result.
chat_history (str | None, optional) Serialized chat history for the current session, excluding the in-progress turn, or None when no history is available.

Returns

str Current recognition result.

stream_chunk_bytes_hint

Defined in xtalk.speech.interfaces.

def stream_chunk_bytes_hint(self) -> int | None

Return the preferred streaming chunk size.

Returns

int | None Recommended byte count for each chunk passed to recognize_stream, or None when no preference is provided.

reset

Defined in xtalk.speech.interfaces.

def reset(self) -> None

Reset internal recognition state.

clone

Defined in xtalk.speech.interfaces.

def clone(self) -> 'ASR'

Clone the ASR instance for a new session.

Returns

ASR Clone with shared weights and independent runtime state.

async_recognize

Defined in xtalk.speech.interfaces.

async def async_recognize(self, audio: bytes) -> str

Asynchronously recognize a full audio buffer.

Parameters

audio (bytes) PCM 16-bit mono audio bytes.

Returns

str Recognized text.

async_recognize_stream

Defined in xtalk.speech.interfaces.

async def async_recognize_stream(self, audio: bytes, *, is_final: bool = False, chat_history: str | None = None) -> str

Asynchronously recognize incremental audio input.

Parameters

audio (bytes) Incremental PCM 16-bit mono audio bytes.
is_final (bool, optional) Whether the caller is asking the ASR to treat the current point as a temporary boundary and optionally flush any tail audio that would otherwise remain buffered. This is only a decoding hint. It does not mean the streaming state must be reset, and previously recognized text for the session must be preserved so later audio can continue from the accumulated result.
chat_history (str | None, optional) Serialized chat history for the current session, excluding the in-progress turn, or None when no history is available.

Returns

str Current recognition result.

TTS

Defined in xtalk.speech.interfaces.

class TTS(ABC)

Abstract base class for text-to-speech engines.

Notes

synthesize is the required baseline API for every implementation. Streaming-capable engines should additionally override synthesize_stream; non-streaming engines should inherit the default compatibility wrapper. The inherited streaming helpers do not by themselves declare native streaming capability.

Methods

synthesize

Defined in xtalk.speech.interfaces.

def synthesize(self, text: str) -> bytes

Synthesize audio for a full text input.

Parameters

text (str) Text to synthesize.

Returns

bytes PCM 16-bit mono audio bytes at 48 kHz.

Notes

Every TTS implementation, including streaming backends, must provide this method.

synthesize_stream

Defined in xtalk.speech.interfaces.

def synthesize_stream(self, text: str, **kwargs) -> Iterable[bytes]

Stream synthesized audio chunks for a text input.

Parameters

text (str) Text to synthesize.
**kwargs Model-specific streaming options.

Yields

bytes PCM 16-bit mono audio bytes at 48 kHz.

Notes

Override this method only when the backend supports native streaming synthesis. The default implementation yields a single chunk produced by synthesize for compatibility and should not be treated as a declaration of streaming support.

async_synthesize

Defined in xtalk.speech.interfaces.

async def async_synthesize(self, text: str, **kwargs: Any) -> bytes

Asynchronously synthesize audio for text.

Parameters

text (str) Text to synthesize.
**kwargs Model-specific synthesis options.

Returns

bytes Synthesized PCM audio bytes.

Notes

This method is an optional async optimization. Implementations may inherit the default executor-based wrapper.

async_synthesize_stream

Defined in xtalk.speech.interfaces.

async def async_synthesize_stream(self, text: str, **kwargs: Any) -> AsyncIterator[bytes]

Asynchronously stream synthesized audio chunks.

Parameters

text (str) Text to synthesize.
**kwargs Model-specific synthesis options.

Yields

bytes Streamed PCM audio chunks.

Notes

This method is an optional async optimization for streaming-capable backends. When not overridden, it asynchronously iterates over synthesize_stream.

clone

Defined in xtalk.speech.interfaces.

def clone(self) -> 'TTS'

Clone the TTS engine for a new session.

Returns

TTS Session-safe clone.

set_voice

Defined in xtalk.speech.interfaces.

def set_voice(self, voice_names: list[str]) -> None

Update the active voice selection.

Parameters

voice_names (list[str]) One or more voice names understood by the implementation.

set_emotion

Defined in xtalk.speech.interfaces.

def set_emotion(self, emotion: str | list[float]) -> None

Update the active synthesis emotion.

Parameters

emotion (str | list[float]) Emotion label or model-specific emotion vector.

Captioner

Defined in xtalk.speech.interfaces.

class Captioner(ABC)

Abstract base class for audio captioning models.

Methods

caption

Defined in xtalk.speech.interfaces.

def caption(self, audio: bytes) -> str

Generate a caption for audio.

Parameters

audio (bytes) PCM 16-bit mono audio bytes at 16 kHz.

Returns

str Generated caption text.

caption_stream

Defined in xtalk.speech.interfaces.

def caption_stream(self, audio: bytes) -> Iterable[str]

Stream caption text for audio input.

Parameters

audio (bytes) PCM 16-bit mono audio bytes at 16 kHz.

Yields

str Streamed caption text.

async_caption

Defined in xtalk.speech.interfaces.

async def async_caption(self, audio: bytes) -> str

Asynchronously caption audio.

Parameters

audio (bytes) PCM 16-bit mono audio bytes at 16 kHz.

Returns

str Generated caption text.

async_caption_stream

Defined in xtalk.speech.interfaces.

async def async_caption_stream(self, audio: bytes) -> AsyncIterator[str]

Asynchronously stream caption text.

Parameters

audio (bytes) PCM 16-bit mono audio bytes at 16 kHz.

Yields

str Streamed caption text.

PuntRestorer

Defined in xtalk.speech.interfaces.

class PuntRestorer(ABC)

Abstract base class for punctuation restoration models.

Methods

restore

Defined in xtalk.speech.interfaces.

def restore(self, text: str) -> str

Restore punctuation in text.

Parameters

text (str) Text without reliable punctuation.

Returns

str Text with restored punctuation.

async_restore

Defined in xtalk.speech.interfaces.

async def async_restore(self, text: str) -> str

Asynchronously restore punctuation in text.

Parameters

text (str) Text without reliable punctuation.

Returns

str Restored text.

VAD

Defined in xtalk.speech.interfaces.

class VAD(ABC)

Abstract base class for voice activity detection engines.

Methods

is_speech

Defined in xtalk.speech.interfaces.

def is_speech(self, frame: bytes) -> bool

Determine whether an audio frame contains speech.

Parameters

frame (bytes) PCM 16-bit mono audio bytes at 16 kHz.

Returns

bool True if speech is detected, otherwise False.

async_is_speech

Defined in xtalk.speech.interfaces.

async def async_is_speech(self, frame: bytes) -> bool

Asynchronously determine whether an audio frame contains speech.

Parameters

frame (bytes) PCM 16-bit mono audio bytes at 16 kHz.

Returns

bool True if speech is detected, otherwise False.

SpeechEnhancer

Defined in xtalk.speech.interfaces.

class SpeechEnhancer(ABC)

Abstract base class for speech enhancement engines.

Notes

Inputs and outputs use PCM 16-bit mono audio bytes at 16 kHz.

Methods

enhance

Defined in xtalk.speech.interfaces.

def enhance(self, audio: bytes) -> bytes

Enhance an audio frame.

Parameters

audio (bytes) PCM 16-bit mono audio bytes at 16 kHz.

Returns

bytes Enhanced PCM audio bytes.

flush

Defined in xtalk.speech.interfaces.

def flush(self) -> bytes

Flush any internally buffered audio.

Returns

bytes Remaining enhanced PCM audio bytes.

async_enhance

Defined in xtalk.speech.interfaces.

async def async_enhance(self, audio: bytes) -> bytes

Asynchronously enhance audio.

Parameters

audio (bytes) PCM 16-bit mono audio bytes at 16 kHz.

Returns

bytes Enhanced PCM audio bytes.

async_flush

Defined in xtalk.speech.interfaces.

async def async_flush(self) -> bytes

Asynchronously flush buffered audio.

Returns

bytes Remaining enhanced PCM audio bytes.

reset

Defined in xtalk.speech.interfaces.

def reset(self) -> None

Reset internal buffers and caches.

clone

Defined in xtalk.speech.interfaces.

def clone(self) -> 'SpeechEnhancer'

Clone the speech enhancer for a new session.

Returns

SpeechEnhancer Clone with shared weights and isolated runtime state.

SpeakerEncoder

Defined in xtalk.speech.interfaces.

class SpeakerEncoder(ABC)

Abstract base class for speaker embedding models.

Methods

extract

Defined in xtalk.speech.interfaces.

def extract(self, audio: bytes) -> np.ndarray

Generate a speaker embedding vector.

Parameters

audio (bytes) PCM 16-bit mono audio bytes.

Returns

np.ndarray Speaker embedding vector.

async_extract

Defined in xtalk.speech.interfaces.

async def async_extract(self, audio: bytes) -> np.ndarray

Asynchronously extract a speaker embedding.

Parameters

audio (bytes) PCM 16-bit mono audio bytes.

Returns

np.ndarray Speaker embedding vector.

similarity

Defined in xtalk.speech.interfaces.

def similarity(self, embedding1: np.ndarray, embedding2: np.ndarray) -> float

Compute similarity between two speaker embeddings.

Parameters

embedding1 (np.ndarray) First speaker embedding.
embedding2 (np.ndarray) Second speaker embedding.

Returns

float Cosine similarity score.

SpeechSpeedController

Defined in xtalk.speech.interfaces.

class SpeechSpeedController(ABC)

Interface for TTS speed controllers.

Methods

process

Defined in xtalk.speech.interfaces.

def process(self, audio_bytes: bytes, speed: float = 1.0) -> bytes

Apply a speed adjustment to synthesized audio.

Parameters

audio_bytes (bytes) Synthesized audio bytes.
speed (float, optional) Speed multiplier.

Returns

bytes Processed audio bytes.

async_process

Defined in xtalk.speech.interfaces.

async def async_process(self, audio_bytes: bytes, speed: float = 1.0) -> bytes

Asynchronously apply a speed adjustment to audio.

Parameters

audio_bytes (bytes) Synthesized audio bytes.
speed (float, optional) Speed multiplier.

Returns

bytes Processed audio bytes.

TurnDetector

Defined in xtalk.speech.interfaces.

class TurnDetector(ABC)

Abstract interface for turn-taking detectors.

Methods

__init__

Defined in xtalk.speech.interfaces.

def __init__(self) -> None

listening

Defined in xtalk.speech.interfaces.

def listening(self) -> bool

Return whether the detector is currently listening for user turns.

Returns

bool Current listening state.

listening

Defined in xtalk.speech.interfaces.

def listening(self, value: bool) -> None

Update the listening state. This entry is the setter of the listening property.

Parameters

value (bool) New listening state.

listening_lock

Defined in xtalk.speech.interfaces.

def listening_lock(self, is_async: bool = True)

Return the lock guarding listening state changes.

Parameters

is_async (bool, optional) Whether to return the async lock instead of the threading lock.

Returns

asyncio.Lock | threading.Lock Lock object matching the requested concurrency model.

detect

Defined in xtalk.speech.interfaces.

def detect(self, audio: Optional[bytes] = None, text: Optional[str] = None, speech_start: bool = False, speech_pause: Optional[bool] = None) -> TurnDetectionResult

Detect conversational turn state from audio and/or text.

Parameters

audio (bytes | None, optional) Current PCM 16-bit mono audio frame at 16 kHz.
text (str | None, optional) ASR text for the current turn.
speech_start (bool, optional) Whether VAD has just detected the start of speech. This may be provided without audio or text.
speech_pause (bool | None, optional) Whether the user appears to have paused speaking. This is typically provided together with text.

Returns

TurnDetectionResult Turn-detection decision for the current input.

async_detect

Defined in xtalk.speech.interfaces.

async def async_detect(self, audio: Optional[bytes] = None, text: Optional[str] = None, speech_start: bool = False, speech_pause: Optional[bool] = None) -> TurnDetectionResult

Asynchronously detect conversational turn state.

Parameters

audio (bytes | None, optional) Current PCM 16-bit mono audio frame at 16 kHz.
text (str | None, optional) ASR text for the current turn.
speech_start (bool, optional) Whether VAD has just detected the start of speech. This may be provided without audio or text.
speech_pause (bool | None, optional) Whether the user appears to have paused speaking. This is typically provided together with text.

Returns

TurnDetectionResult Turn-detection decision for the current input.

clone

Defined in xtalk.speech.interfaces.

def clone(self) -> 'TurnDetector'

Clone the turn detector for a new session.

Returns

TurnDetector Session-safe clone.