@@ -489,6 +489,43 @@ class SpeechModel(str, Enum):
489489 "The model optimized for accuracy, low latency, ease of use, and multi-language support"
490490
491491
class SpeakerOptions(BaseModel):
    """
    Speaker options for controlling speaker diarization parameters
    """

    # Both bounds are optional; when a value is supplied it must be >= 1.
    min_speakers_expected: Optional[int] = Field(
        default=None,
        ge=1,
        description="Minimum number of speakers expected in the audio",
    )
    max_speakers_expected: Optional[int] = Field(
        default=None,
        ge=1,
        description="Maximum number of speakers expected in the audio",
    )

    # The min/max consistency check is declared once per pydantic major
    # version. `pydantic_v2` is defined outside this class (presumably a
    # module-level flag) and picks which decorator API is used.
    if pydantic_v2:

        @field_validator("max_speakers_expected")
        @classmethod
        def validate_max_speakers(cls, v, info):
            """Reject a maximum that is lower than the minimum found in `info.data`."""
            if v is None:
                return v
            # `.get` keeps this safe when the minimum is absent from `info.data`.
            lower_bound = info.data.get("min_speakers_expected")
            if lower_bound is not None and v < lower_bound:
                raise ValueError(
                    "max_speakers_expected must be greater than or equal to min_speakers_expected"
                )
            return v

    else:

        @validator("max_speakers_expected")
        def validate_max_speakers(cls, v, values):
            """Reject a maximum that is lower than the minimum found in `values`."""
            if v is None:
                return v
            # `.get` keeps this safe when the minimum is absent from `values`.
            lower_bound = values.get("min_speakers_expected")
            if lower_bound is not None and v < lower_bound:
                raise ValueError(
                    "max_speakers_expected must be greater than or equal to min_speakers_expected"
                )
            return v
527+
528+
492529class RawTranscriptionConfig (BaseModel ):
493530 language_code : Optional [Union [str , LanguageCode ]] = None
494531 """
@@ -546,6 +583,9 @@ class RawTranscriptionConfig(BaseModel):
546583 speakers_expected : Optional [int ] = None
547584 "The number of speakers you expect to be in your audio file."
548585
586+ speaker_options : Optional [SpeakerOptions ] = None
587+ "Advanced options for controlling speaker diarization parameters."
588+
549589 content_safety : Optional [bool ] = None
550590 "Enable Content Safety Detection."
551591
@@ -633,6 +673,7 @@ def __init__(
633673 redact_pii_sub : Optional [PIISubstitutionPolicy ] = None ,
634674 speaker_labels : Optional [bool ] = None ,
635675 speakers_expected : Optional [int ] = None ,
676+ speaker_options : Optional [SpeakerOptions ] = None ,
636677 content_safety : Optional [bool ] = None ,
637678 content_safety_confidence : Optional [int ] = None ,
638679 iab_categories : Optional [bool ] = None ,
@@ -675,6 +716,7 @@ def __init__(
675716 redact_pii_sub: The replacement logic for detected PII.
676717 speaker_labels: Enable Speaker Diarization.
677718 speakers_expected: The number of speakers you expect to hear in your audio file. Up to 10 speakers are supported.
719+ speaker_options: Advanced options for controlling speaker diarization parameters, including min and max speakers expected.
678720 content_safety: Enable Content Safety Detection.
679721 iab_categories: Enable Topic Detection.
680722 custom_spelling: Customize how words are spelled and formatted using to and from values.
@@ -722,7 +764,7 @@ def __init__(
722764 redact_pii_policies ,
723765 redact_pii_sub ,
724766 )
725- self .set_speaker_diarization (speaker_labels , speakers_expected )
767+ self .set_speaker_diarization (speaker_labels , speakers_expected , speaker_options )
726768 self .set_content_safety (content_safety , content_safety_confidence )
727769 self .iab_categories = iab_categories
728770 self .set_custom_spelling (custom_spelling , override = True )
@@ -934,6 +976,12 @@ def speakers_expected(self) -> Optional[int]:
934976
935977 return self ._raw_transcription_config .speakers_expected
936978
@property
def speaker_options(self) -> Optional[SpeakerOptions]:
    "The advanced speaker diarization options held by the raw config, if any."

    raw_config = self._raw_transcription_config
    return raw_config.speaker_options
984+
937985 @property
938986 def content_safety (self ) -> Optional [bool ]:
939987 "Returns the status of the Content Safety feature."
def set_speaker_diarization(
    self,
    enable: Optional[bool] = True,
    speakers_expected: Optional[int] = None,
    speaker_options: Optional[SpeakerOptions] = None,
) -> Self:
    """
    Configure Speaker Diarization on the transcript.

    Args:
        `enable`: `False` clears `speaker_labels`, `speakers_expected` and
            `speaker_options`. `True` turns `speaker_labels` on. `None` leaves
            `speaker_labels` untouched but still applies the other arguments.
        `speakers_expected`: The number of speakers in the audio file. Only
            stored when not `None`; an existing value is otherwise kept.
        `speaker_options`: Advanced options for controlling speaker diarization
            parameters. Only stored when not `None`.

    Returns:
        This config instance, so calls can be chained.
    """

    if enable is False:
        # Explicit opt-out: wipe every diarization-related setting.
        updates = {
            "speaker_labels": None,
            "speakers_expected": None,
            "speaker_options": None,
        }
    else:
        # `True` or `None`: collect only what the caller actually supplied.
        updates = {}
        if enable is True:
            updates["speaker_labels"] = True
        if speakers_expected is not None:
            updates["speakers_expected"] = speakers_expected
        if speaker_options is not None:
            updates["speaker_options"] = speaker_options

    # Apply in insertion order: labels, then expected count, then options.
    for field_name, field_value in updates.items():
        setattr(self._raw_transcription_config, field_name, field_value)

    return self
11821241
@@ -1712,6 +1771,9 @@ class BaseTranscript(BaseModel):
17121771 speakers_expected : Optional [int ] = None
17131772 "The number of speakers you expect to be in your audio file."
17141773
1774+ speaker_options : Optional [SpeakerOptions ] = None
1775+ "Advanced options for controlling speaker diarization parameters."
1776+
17151777 content_safety : Optional [bool ] = None
17161778 "Enable Content Safety Detection."
17171779
0 commit comments