// import axios from "axios"; import { Buffer } from "buffer"; import { randomBytes } from "crypto"; import { Readable } from "stream"; // Modified according to https://github.com/Migushthe2nd/MsEdgeTTS /** * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume */ export enum VOLUME { SILENT = "silent", X_SOFT = "x-soft", SOFT = "soft", MEDIUM = "medium", LOUD = "loud", X_LOUD = "x-LOUD", DEFAULT = "default", } /** * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking */ export enum RATE { X_SLOW = "x-slow", SLOW = "slow", MEDIUM = "medium", FAST = "fast", X_FAST = "x-fast", DEFAULT = "default", } /** * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline */ export enum PITCH { X_LOW = "x-low", LOW = "low", MEDIUM = "medium", HIGH = "high", X_HIGH = "x-high", DEFAULT = "default", } /** * Only a few of the [possible formats](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs) are accepted. */ export enum OUTPUT_FORMAT { // Streaming ============================= // AMR_WB_16000HZ = "amr-wb-16000hz", // AUDIO_16KHZ_16BIT_32KBPS_MONO_OPUS = "audio-16khz-16bit-32kbps-mono-opus", // AUDIO_16KHZ_32KBITRATE_MONO_MP3 = "audio-16khz-32kbitrate-mono-mp3", // AUDIO_16KHZ_64KBITRATE_MONO_MP3 = "audio-16khz-64kbitrate-mono-mp3", // AUDIO_16KHZ_128KBITRATE_MONO_MP3 = "audio-16khz-128kbitrate-mono-mp3", // AUDIO_24KHZ_16BIT_24KBPS_MONO_OPUS = "audio-24khz-16bit-24kbps-mono-opus", // AUDIO_24KHZ_16BIT_48KBPS_MONO_OPUS = "audio-24khz-16bit-48kbps-mono-opus", AUDIO_24KHZ_48KBITRATE_MONO_MP3 = "audio-24khz-48kbitrate-mono-mp3", AUDIO_24KHZ_96KBITRATE_MONO_MP3 = "audio-24khz-96kbitrate-mono-mp3", // AUDIO_24KHZ_160KBITRATE_MONO_MP3 = "audio-24khz-160kbitrate-mono-mp3", // AUDIO_48KHZ_96KBITRATE_MONO_MP3 = "audio-48khz-96kbitrate-mono-mp3", // AUDIO_48KHZ_192KBITRATE_MONO_MP3 = "audio-48khz-192kbitrate-mono-mp3", // OGG_16KHZ_16BIT_MONO_OPUS = "ogg-16khz-16bit-mono-opus", // OGG_24KHZ_16BIT_MONO_OPUS = "ogg-24khz-16bit-mono-opus", // OGG_48KHZ_16BIT_MONO_OPUS = "ogg-48khz-16bit-mono-opus", // RAW_8KHZ_8BIT_MONO_ALAW = "raw-8khz-8bit-mono-alaw", // RAW_8KHZ_8BIT_MONO_MULAW = "raw-8khz-8bit-mono-mulaw", // RAW_8KHZ_16BIT_MONO_PCM = "raw-8khz-16bit-mono-pcm", // RAW_16KHZ_16BIT_MONO_PCM = "raw-16khz-16bit-mono-pcm", // RAW_16KHZ_16BIT_MONO_TRUESILK = "raw-16khz-16bit-mono-truesilk", // RAW_22050HZ_16BIT_MONO_PCM = "raw-22050hz-16bit-mono-pcm", // RAW_24KHZ_16BIT_MONO_PCM = "raw-24khz-16bit-mono-pcm", // RAW_24KHZ_16BIT_MONO_TRUESILK = "raw-24khz-16bit-mono-truesilk", // RAW_44100HZ_16BIT_MONO_PCM = "raw-44100hz-16bit-mono-pcm", // RAW_48KHZ_16BIT_MONO_PCM = "raw-48khz-16bit-mono-pcm", // WEBM_16KHZ_16BIT_MONO_OPUS = "webm-16khz-16bit-mono-opus", // WEBM_24KHZ_16BIT_24KBPS_MONO_OPUS = "webm-24khz-16bit-24kbps-mono-opus", WEBM_24KHZ_16BIT_MONO_OPUS = "webm-24khz-16bit-mono-opus", // Non-streaming ============================= // RIFF_8KHZ_8BIT_MONO_ALAW = "riff-8khz-8bit-mono-alaw", // RIFF_8KHZ_8BIT_MONO_MULAW = "riff-8khz-8bit-mono-mulaw", // RIFF_8KHZ_16BIT_MONO_PCM = "riff-8khz-16bit-mono-pcm", // RIFF_22050HZ_16BIT_MONO_PCM = "riff-22050hz-16bit-mono-pcm", // RIFF_24KHZ_16BIT_MONO_PCM = "riff-24khz-16bit-mono-pcm", // RIFF_44100HZ_16BIT_MONO_PCM = "riff-44100hz-16bit-mono-pcm", // RIFF_48KHZ_16BIT_MONO_PCM = "riff-48khz-16bit-mono-pcm", } export type Voice = { Name: string; ShortName: string; Gender: string; Locale: string; SuggestedCodec: string; FriendlyName: string; Status: string; }; export class ProsodyOptions { /** * The pitch to use. * Can be any {@link PITCH}, or a relative frequency in Hz (+50Hz), a relative semitone (+2st), or a relative percentage (+50%). * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline) */ pitch?: PITCH | string = "+0Hz"; /** * The rate to use. * Can be any {@link RATE}, or a relative number (0.5), or string with a relative percentage (+50%). * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking) */ rate?: RATE | string | number = 1.0; /** * The volume to use. * Can be any {@link VOLUME}, or an absolute number (0, 100), a string with a relative number (+50), or a relative percentage (+50%). * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume) */ volume?: VOLUME | string | number = 100.0; } export class MsEdgeTTS { static OUTPUT_FORMAT = OUTPUT_FORMAT; private static TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4"; private static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`; private static SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`; private static BINARY_DELIM = "Path:audio\r\n"; private static VOICE_LANG_REGEX = /\w{2}-\w{2}/; private readonly _enableLogger; private _ws: WebSocket | undefined; private _voice: any; private _voiceLocale: any; private _outputFormat: any; private _streams: { [key: string]: Readable } = {}; private _startTime = 0; private _log(...o: any[]) { if (this._enableLogger) { console.log(...o); } } /** * Create a new `MsEdgeTTS` instance. * * @param agent (optional, **NOT SUPPORTED IN BROWSER**) Use a custom http.Agent implementation like [https-proxy-agent](https://github.com/TooTallNate/proxy-agents) or [socks-proxy-agent](https://github.com/TooTallNate/proxy-agents/tree/main/packages/socks-proxy-agent). * @param enableLogger=false whether to enable the built-in logger. This logs connections inits, disconnects, and incoming data to the console */ public constructor(enableLogger: boolean = false) { this._enableLogger = enableLogger; } private async _send(message: any) { for (let i = 1; i <= 3 && this._ws!.readyState !== this._ws!.OPEN; i++) { if (i == 1) { this._startTime = Date.now(); } this._log("connecting: ", i); await this._initClient(); } this._ws!.send(message); } private _initClient() { this._ws = new WebSocket(MsEdgeTTS.SYNTH_URL); this._ws.binaryType = "arraybuffer"; return new Promise((resolve, reject) => { this._ws!.onopen = () => { this._log( "Connected in", (Date.now() - this._startTime) / 1000, "seconds", ); this._send( `Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n { "context": { "synthesis": { "audio": { "metadataoptions": { "sentenceBoundaryEnabled": "false", "wordBoundaryEnabled": "false" }, "outputFormat": "${this._outputFormat}" } } } } `, ).then(resolve); }; this._ws!.onmessage = (m: any) => { const buffer = Buffer.from(m.data as ArrayBuffer); const message = buffer.toString(); const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)![1]; if (message.includes("Path:turn.start")) { // start of turn, ignore } else if (message.includes("Path:turn.end")) { // end of turn, close stream this._streams[requestId].push(null); } else if (message.includes("Path:response")) { // context response, ignore } else if ( message.includes("Path:audio") && m.data instanceof ArrayBuffer ) { this._pushAudioData(buffer, requestId); } else { this._log("UNKNOWN MESSAGE", message); } }; this._ws!.onclose = () => { this._log( "disconnected after:", (Date.now() - this._startTime) / 1000, "seconds", ); for (const requestId in this._streams) { this._streams[requestId].push(null); } }; this._ws!.onerror = function (error: any) { reject("Connect Error: " + error); }; }); } private _pushAudioData(audioBuffer: Buffer, requestId: string) { const audioStartIndex = audioBuffer.indexOf(MsEdgeTTS.BINARY_DELIM) + MsEdgeTTS.BINARY_DELIM.length; const audioData = audioBuffer.subarray(audioStartIndex); this._streams[requestId].push(audioData); this._log("received audio chunk, size: ", audioData?.length); } private _SSMLTemplate(input: string, options: ProsodyOptions = {}): string { // in case future updates to the edge API block these elements, we'll be concatenating strings. options = { ...new ProsodyOptions(), ...options }; return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this._voiceLocale}"> <voice name="${this._voice}"> <prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}"> ${input} </prosody> </voice> </speak>`; } /** * Fetch the list of voices available in Microsoft Edge. * These, however, are not all. The complete list of voices supported by this module [can be found here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support) (neural, standard, and preview). */ // getVoices(): Promise<Voice[]> { // return new Promise((resolve, reject) => { // axios // .get(MsEdgeTTS.VOICES_URL) // .then((res) => resolve(res.data)) // .catch(reject); // }); // } getVoices(): Promise<Voice[]> { return fetch(MsEdgeTTS.VOICES_URL) .then((response) => { if (!response.ok) { throw new Error("Network response was not ok"); } return response.json(); }) .then((data) => data as Voice[]) .catch((error) => { throw error; }); } /** * Sets the required information for the speech to be synthesised and inits a new WebSocket connection. * Must be called at least once before text can be synthesised. * Saved in this instance. Can be called at any time times to update the metadata. * * @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices) * @param outputFormat any {@link OUTPUT_FORMAT} * @param voiceLocale (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName` */ async setMetadata( voiceName: string, outputFormat: OUTPUT_FORMAT, voiceLocale?: string, ) { const oldVoice = this._voice; const oldVoiceLocale = this._voiceLocale; const oldOutputFormat = this._outputFormat; this._voice = voiceName; this._voiceLocale = voiceLocale; if (!this._voiceLocale) { const voiceLangMatch = MsEdgeTTS.VOICE_LANG_REGEX.exec(this._voice); if (!voiceLangMatch) throw new Error("Could not infer voiceLocale from voiceName!"); this._voiceLocale = voiceLangMatch[0]; } this._outputFormat = outputFormat; const changed = oldVoice !== this._voice || oldVoiceLocale !== this._voiceLocale || oldOutputFormat !== this._outputFormat; // create new client if (changed || this._ws!.readyState !== this._ws!.OPEN) { this._startTime = Date.now(); await this._initClient(); } } private _metadataCheck() { if (!this._ws) throw new Error( "Speech synthesis not configured yet. Run setMetadata before calling toStream or toFile.", ); } /** * Close the WebSocket connection. */ close() { this._ws!.close(); } /** * Writes raw audio synthesised from text in real-time to a {@link Readable}. Uses a basic {@link _SSMLTemplate SML template}. * * @param input the text to synthesise. Can include SSML elements. * @param options (optional) {@link ProsodyOptions} * @returns {Readable} - a `stream.Readable` with the audio data */ toStream(input: string, options?: ProsodyOptions): Readable { const { stream } = this._rawSSMLRequest(this._SSMLTemplate(input, options)); return stream; } toArrayBuffer(input: string, options?: ProsodyOptions): Promise<ArrayBuffer> { return new Promise((resolve, reject) => { let data: Uint8Array[] = []; const readable = this.toStream(input, options); readable.on("data", (chunk) => { data.push(chunk); }); readable.on("end", () => { resolve(Buffer.concat(data).buffer); }); readable.on("error", (err) => { reject(err); }); }); } /** * Writes raw audio synthesised from a request in real-time to a {@link Readable}. Has no SSML template. Basic SSML should be provided in the request. * * @param requestSSML the SSML to send. SSML elements required in order to work. * @returns {Readable} - a `stream.Readable` with the audio data */ rawToStream(requestSSML: string): Readable { const { stream } = this._rawSSMLRequest(requestSSML); return stream; } private _rawSSMLRequest(requestSSML: string): { stream: Readable; requestId: string; } { this._metadataCheck(); const requestId = randomBytes(16).toString("hex"); const request = `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nPath:ssml\r\n\r\n ` + requestSSML.trim(); // https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup const self = this; const stream = new Readable({ read() {}, destroy(error: Error | null, callback: (error: Error | null) => void) { delete self._streams[requestId]; callback(error); }, }); this._streams[requestId] = stream; this._send(request).then(); return { stream, requestId }; } }