C#からVOICEVOXで音声を再生したり、合成した音声をwavファイルとして保存したりするユーティリティーを作ったので、ソースコードを掲載しておきます。
using System.Collections.Generic;
using System.Media;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Text.Json;
using System.Text.Json.Serialization;
using System.Threading.Tasks;
/// <summary>
/// Helpers for a locally running VOICEVOX engine reached over HTTP: synthesize speech from
/// text and either play it immediately or save it as a WAV file.
/// </summary>
public static class VoicevoxUtility
{
    // The IP address is used instead of "localhost" because responses were slow with "localhost".
    const string baseUrl = "http://127.0.0.1:50021/";

    // A single shared client is reused for every request.
    private static readonly HttpClient httpClient = new HttpClient();

    /// <summary>
    /// Synthesizes <paramref name="text"/> and plays the resulting audio, returning once playback has finished.
    /// </summary>
    /// <param name="text">Text to be spoken.</param>
    /// <param name="speakerId">Value sent as the <c>speaker</c> query parameter.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="text"/> is null.</exception>
    /// <exception cref="HttpRequestException">The engine answered with a non-success status code.</exception>
    public static async Task Speek(string text, int speakerId)
    {
        using var response = await Synthesize(text, speakerId).ConfigureAwait(false);

        // Play the audio
        using var httpStream = await response.Content.ReadAsStreamAsync().ConfigureAwait(false);
        using var player = new SoundPlayer(httpStream);
        player.PlaySync();
    }

    /// <summary>
    /// Synthesizes <paramref name="text"/> and writes the resulting audio to <paramref name="outputWaveFilePath"/>.
    /// </summary>
    /// <param name="outputWaveFilePath">Destination file path; the file is created or overwritten.</param>
    /// <param name="text">Text to be spoken.</param>
    /// <param name="speaker">Value sent as the <c>speaker</c> query parameter.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="text"/> is null.</exception>
    /// <exception cref="HttpRequestException">The engine answered with a non-success status code.</exception>
    public static async Task RecordSpeech(string outputWaveFilePath, string text, int speaker)
    {
        // The request is completed (and checked) before the file is created, so a failed
        // request never creates or truncates the output file.
        using var response = await Synthesize(text, speaker).ConfigureAwait(false);

        // Write the audio out
        using var stream = await response.Content.ReadAsStreamAsync().ConfigureAwait(false);
        using var fs = System.IO.File.Create(outputWaveFilePath);
        await stream.CopyToAsync(fs).ConfigureAwait(false);
        await fs.FlushAsync().ConfigureAwait(false);
    }

    /// <summary>
    /// Runs audio_query followed by synthesis and returns the successful synthesis response.
    /// The caller owns the returned response and must dispose it.
    /// </summary>
    private static async Task<HttpResponseMessage> Synthesize(string text, int speakerId)
    {
        string query = await CreateAudioQuery(text, speakerId).ConfigureAwait(false);

        // Speech synthesis
        using var request = new HttpRequestMessage(HttpMethod.Post, $"{baseUrl}synthesis?speaker={speakerId}&enable_interrogative_upspeak=true");
        request.Headers.TryAddWithoutValidation("accept", "audio/wav");
        request.Content = new StringContent(query);
        request.Content.Headers.ContentType = MediaTypeHeaderValue.Parse("application/json");

        var response = await httpClient.SendAsync(request).ConfigureAwait(false);
        try
        {
            // Fail here instead of handing an error body to the player or writing it to disk.
            response.EnsureSuccessStatusCode();
        }
        catch
        {
            response.Dispose();
            throw;
        }
        return response;
    }

    /// <summary>
    /// Posts to the audio_query endpoint and returns the response body as a string,
    /// which is later sent as the body of the synthesis request.
    /// </summary>
    private static async Task<string> CreateAudioQuery(string text, int speakerId)
    {
        if (text is null)
        {
            throw new System.ArgumentNullException(nameof(text));
        }

        // The text travels in the query string, so it must be escaped; otherwise characters
        // such as '&', '#', '+', '%' or spaces would corrupt the request.
        string escapedText = System.Uri.EscapeDataString(text);

        using var requestMessage = new HttpRequestMessage(HttpMethod.Post, $"{baseUrl}audio_query?text={escapedText}&speaker={speakerId}");
        requestMessage.Headers.TryAddWithoutValidation("accept", "application/json");
        requestMessage.Content = new StringContent("");
        requestMessage.Content.Headers.ContentType = MediaTypeHeaderValue.Parse("application/x-www-form-urlencoded");

        using var response = await httpClient.SendAsync(requestMessage).ConfigureAwait(false);
        response.EnsureSuccessStatusCode();
        return await response.Content.ReadAsStringAsync().ConfigureAwait(false);
    }
}
以下は使用例です。
// Play directly.
// NOTE: await needs an async context (async Main, top-level statements, or an async method).
// Blocking with .Wait() can deadlock under a UI synchronization context and wraps errors in AggregateException.
await VoicevoxUtility.Speek("これは直接再生するテストです", 39);
// Save to an audio file, then play it
var wavePath = "output.wav";
await VoicevoxUtility.RecordSpeech(wavePath, "これは録音テストです", 39);
using var player = new SoundPlayer(wavePath);
player.PlaySync();
ちなみに audio_query からのレスポンスは以下のような感じになりました(見やすく整形してます)。
{
"accent_phrases":[
{
"moras":[
{
"text":"コ",
"consonant":"k",
"consonant_length":0.06594131141901016,
"vowel":"o",
"vowel_length":0.06108786165714264,
"pitch":5.282137870788574
},
{
"text":"レ",
"consonant":"r",
"consonant_length":0.021974779665470123,
"vowel":"e",
"vowel_length":0.058412015438079834,
"pitch":5.355251789093018
},
{
"text":"ワ",
"consonant":"w",
"consonant_length":0.03409413993358612,
"vowel":"a",
"vowel_length":0.0845031589269638,
"pitch":5.379842281341553
}
],
"accent":3,
"pause_mora":null,
"is_interrogative":false
},
{
"moras":[
{
"text":"チョ",
"consonant":"ch",
"consonant_length":0.0630991980433464,
"vowel":"o",
"vowel_length":0.060945361852645874,
"pitch":5.118475437164307
},
{
"text":"ク",
"consonant":"k",
"consonant_length":0.04479902237653732,
"vowel":"U",
"vowel_length":0.0330185741186142,
"pitch":0.0
},
{
"text":"セ",
"consonant":"s",
"consonant_length":0.049481965601444244,
"vowel":"e",
"vowel_length":0.06068601459264755,
"pitch":5.565320014953613
},
{
"text":"ツ",
"consonant":"ts",
"consonant_length":0.0557292103767395,
"vowel":"U",
"vowel_length":0.039859622716903687,
"pitch":0.0
}
],
"accent":4,
"pause_mora":null,
"is_interrogative":false
},
{
"moras":[
{
"text":"サ",
"consonant":"s",
"consonant_length":0.06819044798612595,
"vowel":"a",
"vowel_length":0.06987302750349045,
"pitch":5.522080898284912
},
{
"text":"イ",
"consonant":null,
"consonant_length":null,
"vowel":"i",
"vowel_length":0.05302189290523529,
"pitch":5.533734321594238
},
{
"text":"セ",
"consonant":"s",
"consonant_length":0.07046963274478912,
"vowel":"e",
"vowel_length":0.052582331001758575,
"pitch":5.589731216430664
},
{
"text":"エ",
"consonant":null,
"consonant_length":null,
"vowel":"e",
"vowel_length":0.0676243007183075,
"pitch":5.560169219970703
}
],
"accent":4,
"pause_mora":null,
"is_interrogative":false
},
{
"moras":[
{
"text":"ス",
"consonant":"s",
"consonant_length":0.08211246132850647,
"vowel":"u",
"vowel_length":0.034779004752635956,
"pitch":5.563896179199219
},
{
"text":"ル",
"consonant":"r",
"consonant_length":0.0238371342420578,
"vowel":"u",
"vowel_length":0.060263603925704956,
"pitch":5.503627300262451
}
],
"accent":2,
"pause_mora":null,
"is_interrogative":false
},
{
"moras":[
{
"text":"テ",
"consonant":"t",
"consonant_length":0.04385934770107269,
"vowel":"e",
"vowel_length":0.057315394282341,
"pitch":5.5735626220703125
},
{
"text":"ス",
"consonant":"s",
"consonant_length":0.030951015651226044,
"vowel":"U",
"vowel_length":0.04976283758878708,
"pitch":0.0
},
{
"text":"ト",
"consonant":"t",
"consonant_length":0.04408574849367142,
"vowel":"o",
"vowel_length":0.055898360908031464,
"pitch":5.339382171630859
},
{
"text":"デ",
"consonant":"d",
"consonant_length":0.0317467525601387,
"vowel":"e",
"vowel_length":0.05603151023387909,
"pitch":5.168008804321289
},
{
"text":"ス",
"consonant":"s",
"consonant_length":0.05199216306209564,
"vowel":"U",
"vowel_length":0.10056409239768982,
"pitch":0.0
}
],
"accent":1,
"pause_mora":null,
"is_interrogative":false
}
],
"speedScale":1.0,
"pitchScale":0.0,
"intonationScale":1.0,
"volumeScale":1.0,
"prePhonemeLength":0.1,
"postPhonemeLength":0.1,
"outputSamplingRate":24000,
"outputStereo":false,
"kana":"コレワ'/チョ_クセ_ツ'/サイセエ'/スル'/テ'_ストデ_ス"
}