diff --git a/google/cloud/dialogflow/v2/BUILD.bazel b/google/cloud/dialogflow/v2/BUILD.bazel index db1e107ae486f..6863abba14bbe 100644 --- a/google/cloud/dialogflow/v2/BUILD.bazel +++ b/google/cloud/dialogflow/v2/BUILD.bazel @@ -396,7 +396,7 @@ load( csharp_proto_library( name = "dialogflow_csharp_proto", - extra_opts = [], + extra_opts = [""], deps = [":dialogflow_proto"], ) diff --git a/google/cloud/dialogflow/v2/audio_config.proto b/google/cloud/dialogflow/v2/audio_config.proto index d45dbc8fdbcd9..bcb035e5a89f9 100644 --- a/google/cloud/dialogflow/v2/audio_config.proto +++ b/google/cloud/dialogflow/v2/audio_config.proto @@ -36,55 +36,6 @@ option (google.api.resource_definition) = { pattern: "projects/{project}/locations/{location}/phraseSets/{phrase_set}" }; -// Audio encoding of the audio content sent in the conversational query request. -// Refer to the -// [Cloud Speech API -// documentation](https://cloud.google.com/speech-to-text/docs/basics) for more -// details. -enum AudioEncoding { - // Not specified. - AUDIO_ENCODING_UNSPECIFIED = 0; - - // Uncompressed 16-bit signed little-endian samples (Linear PCM). - AUDIO_ENCODING_LINEAR_16 = 1; - - // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio - // Codec) is the recommended encoding because it is lossless (therefore - // recognition is not compromised) and requires only about half the - // bandwidth of `LINEAR16`. `FLAC` stream encoding supports 16-bit and - // 24-bit samples, however, not all fields in `STREAMINFO` are supported. - AUDIO_ENCODING_FLAC = 2; - - // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law. - AUDIO_ENCODING_MULAW = 3; - - // Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000. - AUDIO_ENCODING_AMR = 4; - - // Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000. - AUDIO_ENCODING_AMR_WB = 5; - - // Opus encoded audio frames in Ogg container - // ([OggOpus](https://wiki.xiph.org/OggOpus)). - // `sample_rate_hertz` must be 16000. - AUDIO_ENCODING_OGG_OPUS = 6; - - // Although the use of lossy encodings is not recommended, if a very low - // bitrate encoding is required, `OGG_OPUS` is highly preferred over - // Speex encoding. The [Speex](https://speex.org/) encoding supported by - // Dialogflow API has a header byte in each block, as in MIME type - // `audio/x-speex-with-header-byte`. - // It is a variant of the RTP Speex encoding defined in - // [RFC 5574](https://tools.ietf.org/html/rfc5574). - // The stream is a sequence of blocks, one block per RTP packet. Each block - // starts with a byte containing the length of the block, in bytes, followed - // by one or more frames of Speex data, padded to an integral number of - // bytes (octets) as specified in RFC 5574. In other words, each RTP header - // is replaced with a single byte containing the block length. Only Speex - // wideband is supported. `sample_rate_hertz` must be 16000. - AUDIO_ENCODING_SPEEX_WITH_HEADER_BYTE = 7; -} - // Hints for the speech recognizer to help with recognition in a specific // conversation state. message SpeechContext { @@ -101,7 +52,7 @@ message SpeechContext { // See the [Cloud Speech // documentation](https://cloud.google.com/speech-to-text/quotas) for usage // limits. - repeated string phrases = 1; + repeated string phrases = 1 [(google.api.field_behavior) = OPTIONAL]; // Optional. Boost for this context compared to other contexts: // @@ -112,51 +63,7 @@ message SpeechContext { // // Dialogflow recommends that you use boosts in the range (0, 20] and that you // find a value that fits your use case with binary search. - float boost = 2; -} - -// Variant of the specified [Speech -// model][google.cloud.dialogflow.v2.InputAudioConfig.model] to use. -// -// See the [Cloud Speech -// documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models) -// for which models have different variants. For example, the "phone_call" model -// has both a standard and an enhanced variant. When you use an enhanced model, -// you will generally receive higher quality results than for a standard model. -enum SpeechModelVariant { - // No model variant specified. In this case Dialogflow defaults to - // USE_BEST_AVAILABLE. - SPEECH_MODEL_VARIANT_UNSPECIFIED = 0; - - // Use the best available variant of the [Speech - // model][InputAudioConfig.model] that the caller is eligible for. - // - // Please see the [Dialogflow - // docs](https://cloud.google.com/dialogflow/docs/data-logging) for - // how to make your project eligible for enhanced models. - USE_BEST_AVAILABLE = 1; - - // Use standard model variant even if an enhanced model is available. See the - // [Cloud Speech - // documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models) - // for details about enhanced models. - USE_STANDARD = 2; - - // Use an enhanced model variant: - // - // * If an enhanced variant does not exist for the given - // [model][google.cloud.dialogflow.v2.InputAudioConfig.model] and request - // language, Dialogflow falls back to the standard variant. - // - // The [Cloud Speech - // documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models) - // describes which models have enhanced variants. - // - // * If the API caller isn't eligible for enhanced models, Dialogflow returns - // an error. Please see the [Dialogflow - // docs](https://cloud.google.com/dialogflow/docs/data-logging) - // for how to make your project eligible. - USE_ENHANCED = 3; + float boost = 2 [(google.api.field_behavior) = OPTIONAL]; } // Information for a word recognized by the speech recognizer. @@ -187,21 +94,20 @@ message SpeechWordInfo { // Instructs the speech recognizer how to process the audio content. message InputAudioConfig { // Required. Audio encoding of the audio content to process. - AudioEncoding audio_encoding = 1; + AudioEncoding audio_encoding = 1 [(google.api.field_behavior) = REQUIRED]; // Required. Sample rate (in Hertz) of the audio content sent in the query. - // Refer to - // [Cloud Speech API + // Refer to [Cloud Speech API // documentation](https://cloud.google.com/speech-to-text/docs/basics) for // more details. - int32 sample_rate_hertz = 2; + int32 sample_rate_hertz = 2 [(google.api.field_behavior) = REQUIRED]; // Required. The language of the supplied audio. Dialogflow does not do // translations. See [Language // Support](https://cloud.google.com/dialogflow/docs/reference/language) // for a list of the currently supported language codes. Note that queries in // the same session do not necessarily need to specify the same language. - string language_code = 3; + string language_code = 3 [(google.api.field_behavior) = REQUIRED]; // If `true`, Dialogflow returns // [SpeechWordInfo][google.cloud.dialogflow.v2.SpeechWordInfo] in @@ -277,29 +183,12 @@ message InputAudioConfig { bool enable_automatic_punctuation = 17; } -// Gender of the voice as described in -// [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice). -enum SsmlVoiceGender { - // An unspecified gender, which means that the client doesn't care which - // gender the selected voice will have. - SSML_VOICE_GENDER_UNSPECIFIED = 0; - - // A male voice. - SSML_VOICE_GENDER_MALE = 1; - - // A female voice. - SSML_VOICE_GENDER_FEMALE = 2; - - // A gender-neutral voice. - SSML_VOICE_GENDER_NEUTRAL = 3; -} - // Description of which voice to use for speech synthesis. message VoiceSelectionParams { // Optional. The name of the voice. If not set, the service will choose a // voice based on the other parameters such as language_code and // [ssml_gender][google.cloud.dialogflow.v2.VoiceSelectionParams.ssml_gender]. - string name = 1; + string name = 1 [(google.api.field_behavior) = OPTIONAL]; // Optional. The preferred gender of the voice. If not set, the service will // choose a voice based on the other parameters such as language_code and @@ -307,21 +196,21 @@ message VoiceSelectionParams { // this is only a preference, not requirement. If a voice of the appropriate // gender is not available, the synthesizer should substitute a voice with a // different gender rather than failing the request. - SsmlVoiceGender ssml_gender = 2; + SsmlVoiceGender ssml_gender = 2 [(google.api.field_behavior) = OPTIONAL]; } // Configuration of how speech should be synthesized. message SynthesizeSpeechConfig { // Optional. Speaking rate/speed, in the range [0.25, 4.0]. 1.0 is the normal - // native speed supported by the specific voice. 2.0 is twice as fast, and - // 0.5 is half as fast. If unset(0.0), defaults to the native 1.0 speed. Any - // other values < 0.25 or > 4.0 will return an error. - double speaking_rate = 1; + // native speed supported by the specific voice. 2.0 is twice as fast, and 0.5 + // is half as fast. If unset(0.0), defaults to the native 1.0 speed. Any other + // values < 0.25 or > 4.0 will return an error. + double speaking_rate = 1 [(google.api.field_behavior) = OPTIONAL]; // Optional. Speaking pitch, in the range [-20.0, 20.0]. 20 means increase 20 // semitones from the original pitch. -20 means decrease 20 semitones from the // original pitch. - double pitch = 2; + double pitch = 2 [(google.api.field_behavior) = OPTIONAL]; // Optional. Volume gain (in dB) of the normal native volume supported by the // specific voice, in the range [-96.0, 16.0]. If unset, or set to a value of @@ -331,40 +220,16 @@ message SynthesizeSpeechConfig { // amplitude of the normal native signal amplitude. We strongly recommend not // to exceed +10 (dB) as there's usually no effective increase in loudness for // any value greater than that. - double volume_gain_db = 3; + double volume_gain_db = 3 [(google.api.field_behavior) = OPTIONAL]; // Optional. An identifier which selects 'audio effects' profiles that are // applied on (post synthesized) text to speech. Effects are applied on top of // each other in the order they are given. - repeated string effects_profile_id = 5; + repeated string effects_profile_id = 5 + [(google.api.field_behavior) = OPTIONAL]; // Optional. The desired voice of the synthesized audio. - VoiceSelectionParams voice = 4; -} - -// Audio encoding of the output audio format in Text-To-Speech. -enum OutputAudioEncoding { - // Not specified. - OUTPUT_AUDIO_ENCODING_UNSPECIFIED = 0; - - // Uncompressed 16-bit signed little-endian samples (Linear PCM). - // Audio content returned as LINEAR16 also contains a WAV header. - OUTPUT_AUDIO_ENCODING_LINEAR_16 = 1; - - // MP3 audio at 32kbps. - OUTPUT_AUDIO_ENCODING_MP3 = 2; - - // MP3 audio at 64kbps. - OUTPUT_AUDIO_ENCODING_MP3_64_KBPS = 4; - - // Opus encoded audio wrapped in an ogg container. The result will be a - // file which can be played natively on Android, and in browsers (at least - // Chrome and Firefox). The quality of the encoding is considerably higher - // than MP3 while using approximately the same bitrate. - OUTPUT_AUDIO_ENCODING_OGG_OPUS = 3; - - // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law. - OUTPUT_AUDIO_ENCODING_MULAW = 5; + VoiceSelectionParams voice = 4 [(google.api.field_behavior) = OPTIONAL]; } // Instructs the speech synthesizer on how to generate the output audio content. @@ -386,6 +251,36 @@ message OutputAudioConfig { SynthesizeSpeechConfig synthesize_speech_config = 3; } +// A wrapper of repeated TelephonyDtmf digits. +message TelephonyDtmfEvents { + // A sequence of TelephonyDtmf digits. + repeated TelephonyDtmf dtmf_events = 1; +} + +// Configures speech transcription for +// [ConversationProfile][google.cloud.dialogflow.v2.ConversationProfile]. +message SpeechToTextConfig { + // The speech model used in speech to text. + // `SPEECH_MODEL_VARIANT_UNSPECIFIED`, `USE_BEST_AVAILABLE` will be treated as + // `USE_ENHANCED`. It can be overridden in + // [AnalyzeContentRequest][google.cloud.dialogflow.v2.AnalyzeContentRequest] + // and + // [StreamingAnalyzeContentRequest][google.cloud.dialogflow.v2.StreamingAnalyzeContentRequest] + // request. If enhanced model variant is specified and an enhanced version of + // the specified model for the language does not exist, then it would emit an + // error. + SpeechModelVariant speech_model_variant = 1; + + // Which Speech model to select. Select the model best suited to your domain + // to get best results. If a model is not explicitly specified, then a default + // model is used. + // Refer to + // [Cloud Speech API + // documentation](https://cloud.google.com/speech-to-text/docs/basics#select-model) + // for more details. + string model = 2; +} + // [DTMF](https://en.wikipedia.org/wiki/Dual-tone_multi-frequency_signaling) // digit in Telephony Gateway. enum TelephonyDtmf { @@ -441,32 +336,137 @@ enum TelephonyDtmf { DTMF_POUND = 16; } -// A wrapper of repeated TelephonyDtmf digits. -message TelephonyDtmfEvents { - // A sequence of TelephonyDtmf digits. - repeated TelephonyDtmf dtmf_events = 1; +// Audio encoding of the audio content sent in the conversational query request. +// Refer to the +// [Cloud Speech API +// documentation](https://cloud.google.com/speech-to-text/docs/basics) for more +// details. +enum AudioEncoding { + // Not specified. + AUDIO_ENCODING_UNSPECIFIED = 0; + + // Uncompressed 16-bit signed little-endian samples (Linear PCM). + AUDIO_ENCODING_LINEAR_16 = 1; + + // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio + // Codec) is the recommended encoding because it is lossless (therefore + // recognition is not compromised) and requires only about half the + // bandwidth of `LINEAR16`. `FLAC` stream encoding supports 16-bit and + // 24-bit samples, however, not all fields in `STREAMINFO` are supported. + AUDIO_ENCODING_FLAC = 2; + + // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law. + AUDIO_ENCODING_MULAW = 3; + + // Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000. + AUDIO_ENCODING_AMR = 4; + + // Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000. + AUDIO_ENCODING_AMR_WB = 5; + + // Opus encoded audio frames in Ogg container + // ([OggOpus](https://wiki.xiph.org/OggOpus)). + // `sample_rate_hertz` must be 16000. + AUDIO_ENCODING_OGG_OPUS = 6; + + // Although the use of lossy encodings is not recommended, if a very low + // bitrate encoding is required, `OGG_OPUS` is highly preferred over + // Speex encoding. The [Speex](https://speex.org/) encoding supported by + // Dialogflow API has a header byte in each block, as in MIME type + // `audio/x-speex-with-header-byte`. + // It is a variant of the RTP Speex encoding defined in + // [RFC 5574](https://tools.ietf.org/html/rfc5574). + // The stream is a sequence of blocks, one block per RTP packet. Each block + // starts with a byte containing the length of the block, in bytes, followed + // by one or more frames of Speex data, padded to an integral number of + // bytes (octets) as specified in RFC 5574. In other words, each RTP header + // is replaced with a single byte containing the block length. Only Speex + // wideband is supported. `sample_rate_hertz` must be 16000. + AUDIO_ENCODING_SPEEX_WITH_HEADER_BYTE = 7; } -// Configures speech transcription for -// [ConversationProfile][google.cloud.dialogflow.v2.ConversationProfile]. -message SpeechToTextConfig { - // The speech model used in speech to text. - // `SPEECH_MODEL_VARIANT_UNSPECIFIED`, `USE_BEST_AVAILABLE` will be treated as - // `USE_ENHANCED`. It can be overridden in - // [AnalyzeContentRequest][google.cloud.dialogflow.v2.AnalyzeContentRequest] - // and - // [StreamingAnalyzeContentRequest][google.cloud.dialogflow.v2.StreamingAnalyzeContentRequest] - // request. If enhanced model variant is specified and an enhanced version of - // the specified model for the language does not exist, then it would emit an - // error. - SpeechModelVariant speech_model_variant = 1; +// Variant of the specified [Speech +// model][google.cloud.dialogflow.v2.InputAudioConfig.model] to use. +// +// See the [Cloud Speech +// documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models) +// for which models have different variants. For example, the "phone_call" model +// has both a standard and an enhanced variant. When you use an enhanced model, +// you will generally receive higher quality results than for a standard model. +enum SpeechModelVariant { + // No model variant specified. In this case Dialogflow defaults to + // USE_BEST_AVAILABLE. + SPEECH_MODEL_VARIANT_UNSPECIFIED = 0; - // Which Speech model to select. Select the model best suited to your domain - // to get best results. If a model is not explicitly specified, then a default - // model is used. - // Refer to - // [Cloud Speech API - // documentation](https://cloud.google.com/speech-to-text/docs/basics#select-model) - // for more details. - string model = 2; + // Use the best available variant of the [Speech + // model][InputAudioConfig.model] that the caller is eligible for. + // + // Please see the [Dialogflow + // docs](https://cloud.google.com/dialogflow/docs/data-logging) for + // how to make your project eligible for enhanced models. + USE_BEST_AVAILABLE = 1; + + // Use standard model variant even if an enhanced model is available. See the + // [Cloud Speech + // documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models) + // for details about enhanced models. + USE_STANDARD = 2; + + // Use an enhanced model variant: + // + // * If an enhanced variant does not exist for the given + // [model][google.cloud.dialogflow.v2.InputAudioConfig.model] and request + // language, Dialogflow falls back to the standard variant. + // + // The [Cloud Speech + // documentation](https://cloud.google.com/speech-to-text/docs/enhanced-models) + // describes which models have enhanced variants. + // + // * If the API caller isn't eligible for enhanced models, Dialogflow returns + // an error. Please see the [Dialogflow + // docs](https://cloud.google.com/dialogflow/docs/data-logging) + // for how to make your project eligible. + USE_ENHANCED = 3; +} + +// Gender of the voice as described in +// [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice). +enum SsmlVoiceGender { + // An unspecified gender, which means that the client doesn't care which + // gender the selected voice will have. + SSML_VOICE_GENDER_UNSPECIFIED = 0; + + // A male voice. + SSML_VOICE_GENDER_MALE = 1; + + // A female voice. + SSML_VOICE_GENDER_FEMALE = 2; + + // A gender-neutral voice. + SSML_VOICE_GENDER_NEUTRAL = 3; +} + +// Audio encoding of the output audio format in Text-To-Speech. +enum OutputAudioEncoding { + // Not specified. + OUTPUT_AUDIO_ENCODING_UNSPECIFIED = 0; + + // Uncompressed 16-bit signed little-endian samples (Linear PCM). + // Audio content returned as LINEAR16 also contains a WAV header. + OUTPUT_AUDIO_ENCODING_LINEAR_16 = 1; + + // MP3 audio at 32kbps. + OUTPUT_AUDIO_ENCODING_MP3 = 2; + + // MP3 audio at 64kbps. + OUTPUT_AUDIO_ENCODING_MP3_64_KBPS = 4; + + // Opus encoded audio wrapped in an ogg container. The result will be a + // file which can be played natively on Android, and in browsers (at least + // Chrome and Firefox). The quality of the encoding is considerably higher + // than MP3 while using approximately the same bitrate. + OUTPUT_AUDIO_ENCODING_OGG_OPUS = 3; + + // 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law. + OUTPUT_AUDIO_ENCODING_MULAW = 5; }