// ms_edge_tts.ts
import { Buffer } from "buffer";
import { randomBytes } from "crypto";
import { Readable } from "stream";
// Modified according to https://github.com/Migushthe2nd/MsEdgeTTS
/**
* https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume
*/
export enum VOLUME {
SILENT = "silent",
X_SOFT = "x-soft",
SOFT = "soft",
MEDIUM = "medium",
LOUD = "loud",
X_LOUD = "x-loud",
DEFAULT = "default",
}
/**
* https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking
*/
export enum RATE {
X_SLOW = "x-slow",
SLOW = "slow",
MEDIUM = "medium",
FAST = "fast",
X_FAST = "x-fast",
DEFAULT = "default",
}
/**
* https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline
*/
export enum PITCH {
X_LOW = "x-low",
LOW = "low",
MEDIUM = "medium",
HIGH = "high",
X_HIGH = "x-high",
DEFAULT = "default",
}
/**
* Only a few of the [possible formats](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs) are accepted.
*/
export enum OUTPUT_FORMAT {
// Streaming =============================
// AMR_WB_16000HZ = "amr-wb-16000hz",
// AUDIO_16KHZ_16BIT_32KBPS_MONO_OPUS = "audio-16khz-16bit-32kbps-mono-opus",
// AUDIO_16KHZ_32KBITRATE_MONO_MP3 = "audio-16khz-32kbitrate-mono-mp3",
// AUDIO_16KHZ_64KBITRATE_MONO_MP3 = "audio-16khz-64kbitrate-mono-mp3",
// AUDIO_16KHZ_128KBITRATE_MONO_MP3 = "audio-16khz-128kbitrate-mono-mp3",
// AUDIO_24KHZ_16BIT_24KBPS_MONO_OPUS = "audio-24khz-16bit-24kbps-mono-opus",
// AUDIO_24KHZ_16BIT_48KBPS_MONO_OPUS = "audio-24khz-16bit-48kbps-mono-opus",
AUDIO_24KHZ_48KBITRATE_MONO_MP3 = "audio-24khz-48kbitrate-mono-mp3",
AUDIO_24KHZ_96KBITRATE_MONO_MP3 = "audio-24khz-96kbitrate-mono-mp3",
// AUDIO_24KHZ_160KBITRATE_MONO_MP3 = "audio-24khz-160kbitrate-mono-mp3",
// AUDIO_48KHZ_96KBITRATE_MONO_MP3 = "audio-48khz-96kbitrate-mono-mp3",
// AUDIO_48KHZ_192KBITRATE_MONO_MP3 = "audio-48khz-192kbitrate-mono-mp3",
// OGG_16KHZ_16BIT_MONO_OPUS = "ogg-16khz-16bit-mono-opus",
// OGG_24KHZ_16BIT_MONO_OPUS = "ogg-24khz-16bit-mono-opus",
// OGG_48KHZ_16BIT_MONO_OPUS = "ogg-48khz-16bit-mono-opus",
// RAW_8KHZ_8BIT_MONO_ALAW = "raw-8khz-8bit-mono-alaw",
// RAW_8KHZ_8BIT_MONO_MULAW = "raw-8khz-8bit-mono-mulaw",
// RAW_8KHZ_16BIT_MONO_PCM = "raw-8khz-16bit-mono-pcm",
// RAW_16KHZ_16BIT_MONO_PCM = "raw-16khz-16bit-mono-pcm",
// RAW_16KHZ_16BIT_MONO_TRUESILK = "raw-16khz-16bit-mono-truesilk",
// RAW_22050HZ_16BIT_MONO_PCM = "raw-22050hz-16bit-mono-pcm",
// RAW_24KHZ_16BIT_MONO_PCM = "raw-24khz-16bit-mono-pcm",
// RAW_24KHZ_16BIT_MONO_TRUESILK = "raw-24khz-16bit-mono-truesilk",
// RAW_44100HZ_16BIT_MONO_PCM = "raw-44100hz-16bit-mono-pcm",
// RAW_48KHZ_16BIT_MONO_PCM = "raw-48khz-16bit-mono-pcm",
// WEBM_16KHZ_16BIT_MONO_OPUS = "webm-16khz-16bit-mono-opus",
// WEBM_24KHZ_16BIT_24KBPS_MONO_OPUS = "webm-24khz-16bit-24kbps-mono-opus",
WEBM_24KHZ_16BIT_MONO_OPUS = "webm-24khz-16bit-mono-opus",
// Non-streaming =============================
// RIFF_8KHZ_8BIT_MONO_ALAW = "riff-8khz-8bit-mono-alaw",
// RIFF_8KHZ_8BIT_MONO_MULAW = "riff-8khz-8bit-mono-mulaw",
// RIFF_8KHZ_16BIT_MONO_PCM = "riff-8khz-16bit-mono-pcm",
// RIFF_22050HZ_16BIT_MONO_PCM = "riff-22050hz-16bit-mono-pcm",
// RIFF_24KHZ_16BIT_MONO_PCM = "riff-24khz-16bit-mono-pcm",
// RIFF_44100HZ_16BIT_MONO_PCM = "riff-44100hz-16bit-mono-pcm",
// RIFF_48KHZ_16BIT_MONO_PCM = "riff-48khz-16bit-mono-pcm",
}
export type Voice = {
Name: string;
ShortName: string;
Gender: string;
Locale: string;
SuggestedCodec: string;
FriendlyName: string;
Status: string;
};
export class ProsodyOptions {
/**
* The pitch to use.
* Can be any {@link PITCH}, or a relative frequency in Hz (+50Hz), a relative semitone (+2st), or a relative percentage (+50%).
* [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline)
*/
pitch?: PITCH | string = "+0Hz";
/**
* The rate to use.
* Can be any {@link RATE}, or a relative number (0.5), or string with a relative percentage (+50%).
* [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking)
*/
rate?: RATE | string | number = 1.0;
/**
* The volume to use.
* Can be any {@link VOLUME}, or an absolute number (0, 100), a string with a relative number (+50), or a relative percentage (+50%).
* [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume)
*/
volume?: VOLUME | string | number = 100.0;
}
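// A minimal usage sketch (the values below are illustrative): each prosody
// field accepts either one of the enum presets above or a relative SSML
// string/number, e.g.
//
//   const opts = new ProsodyOptions();
//   opts.pitch = "+2st";        // two semitones above the baseline
//   opts.rate = 1.25;           // 25% faster than the default rate
//   opts.volume = VOLUME.LOUD;  // enum preset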
export class MsEdgeTTS {
static OUTPUT_FORMAT = OUTPUT_FORMAT;
private static TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
private static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
private static SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
private static BINARY_DELIM = "Path:audio\r\n";
private static VOICE_LANG_REGEX = /\w{2}-\w{2}/;
private readonly _enableLogger;
private _ws: WebSocket | undefined;
private _voice: string | undefined;
private _voiceLocale: string | undefined;
private _outputFormat: OUTPUT_FORMAT | undefined;
private _streams: { [key: string]: Readable } = {};
private _startTime = 0;
private _log(...o: any[]) {
if (this._enableLogger) {
console.log(...o);
}
}
/**
* Create a new `MsEdgeTTS` instance.
*
* @param enableLogger=false whether to enable the built-in logger. This logs connection inits, disconnects, and incoming data to the console
*/
public constructor(enableLogger: boolean = false) {
this._enableLogger = enableLogger;
}
private async _send(message: any) {
// Reconnect (up to 3 attempts) if the socket is missing or not yet open.
for (let i = 1; i <= 3 && (!this._ws || this._ws.readyState !== this._ws.OPEN); i++) {
if (i == 1) {
this._startTime = Date.now();
}
this._log("connecting: ", i);
await this._initClient();
}
this._ws!.send(message);
}
private _initClient() {
this._ws = new WebSocket(MsEdgeTTS.SYNTH_URL);
this._ws.binaryType = "arraybuffer";
return new Promise((resolve, reject) => {
this._ws!.onopen = () => {
this._log(
"Connected in",
(Date.now() - this._startTime) / 1000,
"seconds",
);
this._send(
`Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n
{
"context": {
"synthesis": {
"audio": {
"metadataoptions": {
"sentenceBoundaryEnabled": "false",
"wordBoundaryEnabled": "false"
},
"outputFormat": "${this._outputFormat}"
}
}
}
}
`,
).then(resolve);
};
this._ws!.onmessage = (m: any) => {
const buffer = Buffer.from(m.data as ArrayBuffer);
const message = buffer.toString();
const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)![1];
if (message.includes("Path:turn.start")) {
// start of turn, ignore
} else if (message.includes("Path:turn.end")) {
// end of turn, close the stream for this request (if still tracked)
this._streams[requestId]?.push(null);
} else if (message.includes("Path:response")) {
// context response, ignore
} else if (
message.includes("Path:audio") &&
m.data instanceof ArrayBuffer
) {
this._pushAudioData(buffer, requestId);
} else {
this._log("UNKNOWN MESSAGE", message);
}
};
this._ws!.onclose = () => {
this._log(
"disconnected after:",
(Date.now() - this._startTime) / 1000,
"seconds",
);
for (const requestId in this._streams) {
this._streams[requestId].push(null);
}
};
this._ws!.onerror = (error: any) => {
reject(new Error("Connect Error: " + error));
};
});
}
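// Binary frames from the service carry text headers followed by the raw audio
// bytes; BINARY_DELIM ("Path:audio\r\n") marks where the audio payload starts.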
private _pushAudioData(audioBuffer: Buffer, requestId: string) {
const audioStartIndex =
audioBuffer.indexOf(MsEdgeTTS.BINARY_DELIM) +
MsEdgeTTS.BINARY_DELIM.length;
const audioData = audioBuffer.subarray(audioStartIndex);
this._streams[requestId]?.push(audioData);
this._log("received audio chunk, size: ", audioData?.length);
}
private _SSMLTemplate(input: string, options: ProsodyOptions = {}): string {
// in case future updates to the Edge API block these elements, we build the SSML by concatenating strings.
options = { ...new ProsodyOptions(), ...options };
return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this._voiceLocale}">
<voice name="${this._voice}">
<prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}">
${input}
</prosody>
</voice>
</speak>`;
}
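// For reference, with voice "en-US-AriaNeural" (an assumed example) and the
// default ProsodyOptions, the template above produces roughly:
//
//   <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
//          xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
//     <voice name="en-US-AriaNeural">
//       <prosody pitch="+0Hz" rate="1" volume="100">Hello world</prosody>
//     </voice>
//   </speak>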
/**
* Fetch the list of voices available in Microsoft Edge.
* This is not the full set: the complete list of voices supported by this module [can be found here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support) (neural, standard, and preview).
*/
getVoices(): Promise<Voice[]> {
return fetch(MsEdgeTTS.VOICES_URL).then((response) => {
if (!response.ok) {
throw new Error(
`Failed to fetch voices: ${response.status} ${response.statusText}`,
);
}
return response.json() as Promise<Voice[]>;
});
}
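// Usage sketch (assumes an environment with a global `fetch`):
//
//   const tts = new MsEdgeTTS();
//   const voices = await tts.getVoices();
//   const english = voices.filter((v) => v.Locale.startsWith("en-"));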
/**
* Sets the required information for the speech to be synthesised and initializes a new WebSocket connection.
* Must be called at least once before text can be synthesised.
* The values are saved on this instance and can be updated by calling this method again at any time.
*
* @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices)
* @param outputFormat any {@link OUTPUT_FORMAT}
* @param voiceLocale (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName`
*/
async setMetadata(
voiceName: string,
outputFormat: OUTPUT_FORMAT,
voiceLocale?: string,
) {
const oldVoice = this._voice;
const oldVoiceLocale = this._voiceLocale;
const oldOutputFormat = this._outputFormat;
this._voice = voiceName;
this._voiceLocale = voiceLocale;
if (!this._voiceLocale) {
const voiceLangMatch = MsEdgeTTS.VOICE_LANG_REGEX.exec(this._voice);
if (!voiceLangMatch)
throw new Error("Could not infer voiceLocale from voiceName!");
this._voiceLocale = voiceLangMatch[0];
}
this._outputFormat = outputFormat;
const changed =
oldVoice !== this._voice ||
oldVoiceLocale !== this._voiceLocale ||
oldOutputFormat !== this._outputFormat;
// create a new client if anything changed or the socket is missing/not open
if (changed || !this._ws || this._ws.readyState !== this._ws.OPEN) {
this._startTime = Date.now();
await this._initClient();
}
}
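// Usage sketch ("en-US-AriaNeural" is an assumed example voice): the locale
// is inferred from the ShortName via VOICE_LANG_REGEX when not supplied.
//
//   await tts.setMetadata(
//     "en-US-AriaNeural",
//     OUTPUT_FORMAT.AUDIO_24KHZ_48KBITRATE_MONO_MP3,
//   );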
private _metadataCheck() {
if (!this._ws)
throw new Error(
"Speech synthesis not configured yet. Run setMetadata before calling toStream or toFile.",
);
}
/**
* Close the WebSocket connection.
*/
close() {
this._ws?.close();
}
/**
* Writes raw audio synthesised from text in real-time to a {@link Readable}. Uses a basic {@link _SSMLTemplate SSML template}.
*
* @param input the text to synthesise. Can include SSML elements.
* @param options (optional) {@link ProsodyOptions}
* @returns {Readable} - a `stream.Readable` with the audio data
*/
toStream(input: string, options?: ProsodyOptions): Readable {
const { stream } = this._rawSSMLRequest(this._SSMLTemplate(input, options));
return stream;
}
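// Node-only sketch (assumes the "fs" module; not available in the browser):
//
//   import { createWriteStream } from "fs";
//   tts.toStream("Hello world").pipe(createWriteStream("hello.mp3"));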
/**
* Synthesises text and resolves with the whole audio as an {@link ArrayBuffer} once the stream ends.
*
* @param input the text to synthesise. Can include SSML elements.
* @param options (optional) {@link ProsodyOptions}
*/
toArrayBuffer(input: string, options?: ProsodyOptions): Promise<ArrayBuffer> {
return new Promise((resolve, reject) => {
const data: Uint8Array[] = [];
const readable = this.toStream(input, options);
readable.on("data", (chunk) => {
data.push(chunk);
});
readable.on("end", () => {
// Buffer.concat may use Node's shared allocation pool, so `.buffer` can be
// larger than the audio itself; slice out only the bytes that belong to it.
const merged = Buffer.concat(data);
resolve(
merged.buffer.slice(merged.byteOffset, merged.byteOffset + merged.byteLength),
);
});
readable.on("error", (err) => {
reject(err);
});
});
}
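// Browser sketch (assumes an MP3 output format and a prior user gesture that
// has unlocked the AudioContext):
//
//   const buf = await tts.toArrayBuffer("Hello world");
//   const ctx = new AudioContext();
//   const decoded = await ctx.decodeAudioData(buf);
//   const src = ctx.createBufferSource();
//   src.buffer = decoded;
//   src.connect(ctx.destination);
//   src.start();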
/**
* Writes raw audio synthesised from a request in real-time to a {@link Readable}. Has no SSML template. Basic SSML should be provided in the request.
*
* @param requestSSML the SSML to send. Basic SSML elements are required for the request to work.
* @returns {Readable} - a `stream.Readable` with the audio data
*/
rawToStream(requestSSML: string): Readable {
const { stream } = this._rawSSMLRequest(requestSSML);
return stream;
}
private _rawSSMLRequest(requestSSML: string): {
stream: Readable;
requestId: string;
} {
this._metadataCheck();
const requestId = randomBytes(16).toString("hex");
const request =
`X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nPath:ssml\r\n\r\n
` + requestSSML.trim();
// https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup
const self = this;
const stream = new Readable({
read() {},
destroy(error: Error | null, callback: (error: Error | null) => void) {
delete self._streams[requestId];
callback(error);
},
});
this._streams[requestId] = stream;
// fire-and-forget: the audio arrives via the WebSocket message handler
void this._send(request);
return { stream, requestId };
}
}
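// End-to-end sketch (the voice name and input text are illustrative):
//
//   const tts = new MsEdgeTTS();
//   await tts.setMetadata(
//     "en-US-AriaNeural",
//     OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS,
//   );
//   const audio = await tts.toArrayBuffer("The quick brown fox.", { rate: RATE.FAST });
//   tts.close();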