diart#

Subpackages#

Submodules#

Package Contents#

Classes#

SpeakerDiarization

Represents a streaming audio pipeline

Pipeline

Represents a streaming audio pipeline

SpeakerDiarizationConfig

Configuration containing the required parameters to build and run a pipeline

PipelineConfig

Configuration containing the required parameters to build and run a pipeline

VoiceActivityDetection

Represents a streaming audio pipeline

VoiceActivityDetectionConfig

Configuration containing the required parameters to build and run a pipeline

class diart.SpeakerDiarization(config=None)#

Bases: diart.blocks.base.Pipeline

Represents a streaming audio pipeline

Parameters:

config (SpeakerDiarizationConfig | None) –

property config: SpeakerDiarizationConfig#
Return type:

SpeakerDiarizationConfig

static get_config_class()#
Return type:

type

static suggest_metric()#
Return type:

pyannote.metrics.base.BaseMetric

static hyper_parameters()#
Return type:

Sequence[diart.blocks.base.HyperParameter]

set_timestamp_shift(shift)#
Parameters:

shift (float) –

reset()#
__call__(waveforms)#

Diarize the next audio chunks of an audio stream.

Parameters:

waveforms (Sequence[SlidingWindowFeature]) – A sequence of consecutive audio chunks from an audio stream.

Returns:

Speaker diarization of each chunk alongside their corresponding audio.

Return type:

Sequence[tuple[Annotation, SlidingWindowFeature]]

class diart.Pipeline#

Bases: abc.ABC

Represents a streaming audio pipeline

abstract property config: PipelineConfig#
Return type:

PipelineConfig

abstract static get_config_class()#
Return type:

type

abstract static suggest_metric()#
Return type:

pyannote.metrics.base.BaseMetric

abstract static hyper_parameters()#
Return type:

Sequence[HyperParameter]

abstract reset()#
abstract set_timestamp_shift(shift)#
Parameters:

shift (float) –

abstract __call__(waveforms)#

Runs the next steps of the pipeline given a list of consecutive audio chunks.

Parameters:

waveforms (Sequence[SlidingWindowFeature]) – Consecutive chunk waveforms for the pipeline to ingest

Returns:

For each input waveform, a tuple containing the pipeline output and its respective audio

Return type:

Sequence[Tuple[Any, SlidingWindowFeature]]

class diart.SpeakerDiarizationConfig(segmentation=None, embedding=None, duration=5, step=0.5, latency=None, tau_active=0.6, rho_update=0.3, delta_new=1, gamma=3, beta=10, max_speakers=20, normalize_embedding_weights=False, device=None, sample_rate=16000, **kwargs)#

Bases: diart.blocks.base.PipelineConfig

Configuration containing the required parameters to build and run a pipeline

Parameters:
  • segmentation (diart.models.SegmentationModel | None) –

  • embedding (diart.models.EmbeddingModel | None) –

  • duration (float) –

  • step (float) –

  • latency (float | typing_extensions.Literal["max", "min"] | None) –

  • tau_active (float) –

  • rho_update (float) –

  • delta_new (float) –

  • gamma (float) –

  • beta (float) –

  • max_speakers (int) –

  • normalize_embedding_weights (bool) –

  • device (torch.device | None) –

  • sample_rate (int) –

property duration: float#

The duration of an input audio chunk (in seconds)

Return type:

float

property step: float#

The step between two consecutive input audio chunks (in seconds)

Return type:

float

property latency: float#

The algorithmic latency of the pipeline (in seconds). At time t of the audio stream, the pipeline will output predictions for time t - latency.

Return type:

float

property sample_rate: int#

The sample rate of the input audio stream

Return type:

int

class diart.PipelineConfig#

Bases: abc.ABC

Configuration containing the required parameters to build and run a pipeline

abstract property duration: float#

The duration of an input audio chunk (in seconds)

Return type:

float

abstract property step: float#

The step between two consecutive input audio chunks (in seconds)

Return type:

float

abstract property latency: float#

The algorithmic latency of the pipeline (in seconds). At time t of the audio stream, the pipeline will output predictions for time t - latency.

Return type:

float

abstract property sample_rate: int#

The sample rate of the input audio stream

Return type:

int

get_file_padding(filepath)#
Parameters:

filepath (diart.audio.FilePath) –

Return type:

Tuple[float, float]

class diart.VoiceActivityDetection(config=None)#

Bases: diart.blocks.base.Pipeline

Represents a streaming audio pipeline

Parameters:

config (VoiceActivityDetectionConfig | None) –

property config: diart.blocks.base.PipelineConfig#
Return type:

diart.blocks.base.PipelineConfig

static get_config_class()#
Return type:

type

static suggest_metric()#
Return type:

pyannote.metrics.base.BaseMetric

static hyper_parameters()#
Return type:

Sequence[diart.blocks.base.HyperParameter]

reset()#
set_timestamp_shift(shift)#
Parameters:

shift (float) –

__call__(waveforms)#

Runs the next steps of the pipeline given a list of consecutive audio chunks.

Parameters:

waveforms (Sequence[SlidingWindowFeature]) – Consecutive chunk waveforms for the pipeline to ingest

Returns:

For each input waveform, a tuple containing the pipeline output and its respective audio

Return type:

Sequence[Tuple[Any, SlidingWindowFeature]]

class diart.VoiceActivityDetectionConfig(segmentation=None, duration=5, step=0.5, latency=None, tau_active=0.6, device=None, sample_rate=16000, **kwargs)#

Bases: diart.blocks.base.PipelineConfig

Configuration containing the required parameters to build and run a pipeline

Parameters:
  • segmentation (diart.models.SegmentationModel | None) –

  • duration (float) –

  • step (float) –

  • latency (float | typing_extensions.Literal["max", "min"] | None) –

  • tau_active (float) –

  • device (torch.device | None) –

  • sample_rate (int) –

property duration: float#

The duration of an input audio chunk (in seconds)

Return type:

float

property step: float#

The step between two consecutive input audio chunks (in seconds)

Return type:

float

property latency: float#

The algorithmic latency of the pipeline (in seconds). At time t of the audio stream, the pipeline will output predictions for time t - latency.

Return type:

float

property sample_rate: int#

The sample rate of the input audio stream

Return type:

int