// ms_edge_tts.ts
import { Buffer } from "buffer";
import { randomBytes } from "crypto";
import { Readable } from "stream";

// Modified according to https://github.com/Migushthe2nd/MsEdgeTTS

/**
 * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume
 */
export enum VOLUME {
  SILENT = "silent",
  X_SOFT = "x-soft",
  SOFT = "soft",
  MEDIUM = "medium",
  LOUD = "loud",
  X_LOUD = "x-loud",
  DEFAULT = "default",
}

/**
 * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking
 */
export enum RATE {
  X_SLOW = "x-slow",
  SLOW = "slow",
  MEDIUM = "medium",
  FAST = "fast",
  X_FAST = "x-fast",
  DEFAULT = "default",
}

/**
 * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline
 */
export enum PITCH {
  X_LOW = "x-low",
  LOW = "low",
  MEDIUM = "medium",
  HIGH = "high",
  X_HIGH = "x-high",
  DEFAULT = "default",
}

/**
 * Only a few of the [possible formats](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs) are accepted.
 */
export enum OUTPUT_FORMAT {
  // Streaming =============================
  // AMR_WB_16000HZ = "amr-wb-16000hz",
  // AUDIO_16KHZ_16BIT_32KBPS_MONO_OPUS = "audio-16khz-16bit-32kbps-mono-opus",
  // AUDIO_16KHZ_32KBITRATE_MONO_MP3 = "audio-16khz-32kbitrate-mono-mp3",
  // AUDIO_16KHZ_64KBITRATE_MONO_MP3 = "audio-16khz-64kbitrate-mono-mp3",
  // AUDIO_16KHZ_128KBITRATE_MONO_MP3 = "audio-16khz-128kbitrate-mono-mp3",
  // AUDIO_24KHZ_16BIT_24KBPS_MONO_OPUS = "audio-24khz-16bit-24kbps-mono-opus",
  // AUDIO_24KHZ_16BIT_48KBPS_MONO_OPUS = "audio-24khz-16bit-48kbps-mono-opus",
  AUDIO_24KHZ_48KBITRATE_MONO_MP3 = "audio-24khz-48kbitrate-mono-mp3",
  AUDIO_24KHZ_96KBITRATE_MONO_MP3 = "audio-24khz-96kbitrate-mono-mp3",
  // AUDIO_24KHZ_160KBITRATE_MONO_MP3 = "audio-24khz-160kbitrate-mono-mp3",
  // AUDIO_48KHZ_96KBITRATE_MONO_MP3 = "audio-48khz-96kbitrate-mono-mp3",
  // AUDIO_48KHZ_192KBITRATE_MONO_MP3 = "audio-48khz-192kbitrate-mono-mp3",
  // OGG_16KHZ_16BIT_MONO_OPUS = "ogg-16khz-16bit-mono-opus",
  // OGG_24KHZ_16BIT_MONO_OPUS = "ogg-24khz-16bit-mono-opus",
  // OGG_48KHZ_16BIT_MONO_OPUS = "ogg-48khz-16bit-mono-opus",
  // RAW_8KHZ_8BIT_MONO_ALAW = "raw-8khz-8bit-mono-alaw",
  // RAW_8KHZ_8BIT_MONO_MULAW = "raw-8khz-8bit-mono-mulaw",
  // RAW_8KHZ_16BIT_MONO_PCM = "raw-8khz-16bit-mono-pcm",
  // RAW_16KHZ_16BIT_MONO_PCM = "raw-16khz-16bit-mono-pcm",
  // RAW_16KHZ_16BIT_MONO_TRUESILK = "raw-16khz-16bit-mono-truesilk",
  // RAW_22050HZ_16BIT_MONO_PCM = "raw-22050hz-16bit-mono-pcm",
  // RAW_24KHZ_16BIT_MONO_PCM = "raw-24khz-16bit-mono-pcm",
  // RAW_24KHZ_16BIT_MONO_TRUESILK = "raw-24khz-16bit-mono-truesilk",
  // RAW_44100HZ_16BIT_MONO_PCM = "raw-44100hz-16bit-mono-pcm",
  // RAW_48KHZ_16BIT_MONO_PCM = "raw-48khz-16bit-mono-pcm",
  // WEBM_16KHZ_16BIT_MONO_OPUS = "webm-16khz-16bit-mono-opus",
  // WEBM_24KHZ_16BIT_24KBPS_MONO_OPUS = "webm-24khz-16bit-24kbps-mono-opus",
  WEBM_24KHZ_16BIT_MONO_OPUS = "webm-24khz-16bit-mono-opus",
  // Non-streaming =============================
  // RIFF_8KHZ_8BIT_MONO_ALAW = "riff-8khz-8bit-mono-alaw",
  // RIFF_8KHZ_8BIT_MONO_MULAW = "riff-8khz-8bit-mono-mulaw",
  // RIFF_8KHZ_16BIT_MONO_PCM = "riff-8khz-16bit-mono-pcm",
  // RIFF_22050HZ_16BIT_MONO_PCM = "riff-22050hz-16bit-mono-pcm",
  // RIFF_24KHZ_16BIT_MONO_PCM = "riff-24khz-16bit-mono-pcm",
  // RIFF_44100HZ_16BIT_MONO_PCM = "riff-44100hz-16bit-mono-pcm",
  // RIFF_48KHZ_16BIT_MONO_PCM = "riff-48khz-16bit-mono-pcm",
}

export type Voice = {
  Name: string;
  ShortName: string;
  Gender: string;
  Locale: string;
  SuggestedCodec: string;
  FriendlyName: string;
  Status: string;
};

export class ProsodyOptions {
  /**
   * The pitch to use.
   * Can be any {@link PITCH}, or a relative frequency in Hz (+50Hz), a relative semitone (+2st), or a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline)
   */
  pitch?: PITCH | string = "+0Hz";
  /**
   * The rate to use.
   * Can be any {@link RATE}, or a relative number (0.5), or string with a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking)
   */
  rate?: RATE | string | number = 1.0;
  /**
   * The volume to use.
   * Can be any {@link VOLUME}, or an absolute number (0.0 to 100.0), a string with a relative number (+50), or a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume)
   */
  volume?: VOLUME | string | number = 100.0;
}
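
// Example (a sketch) of setting ProsodyOptions; the values below are illustrative, not defaults:
//   const options = new ProsodyOptions();
//   options.pitch = "+10Hz"; // relative pitch in Hz
//   options.rate = 1.25;     // multiplier of the default speaking rate
//   options.volume = 90;     // absolute volume between 0.0 and 100.0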

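/**
 * Text-to-speech client for the Microsoft Edge read-aloud endpoint.
 *
 * Minimal usage sketch (assumes a runtime with a global `WebSocket`, e.g. a browser
 * or a recent Node.js; the voice name is only an illustration):
 *
 *   const tts = new MsEdgeTTS();
 *   await tts.setMetadata("en-US-AriaNeural", OUTPUT_FORMAT.AUDIO_24KHZ_48KBITRATE_MONO_MP3);
 *   const stream = tts.toStream("Hello, world!");
 *   stream.on("data", (chunk) => console.log("chunk bytes:", chunk.length));
 *   stream.on("end", () => tts.close());
 */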
export class MsEdgeTTS {
  static OUTPUT_FORMAT = OUTPUT_FORMAT;
  private static TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
  private static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
  private static SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
  private static BINARY_DELIM = "Path:audio\r\n";
  private static VOICE_LANG_REGEX = /\w{2}-\w{2}/;
  private readonly _enableLogger;
  private _ws: WebSocket | undefined;
  private _voice: any;
  private _voiceLocale: any;
  private _outputFormat: any;
  private _streams: { [key: string]: Readable } = {};
  private _startTime = 0;

  private _log(...o: any[]) {
    if (this._enableLogger) {
      console.log(...o);
    }
  }

  /**
   * Create a new `MsEdgeTTS` instance.
   *
   * @param enableLogger=false whether to enable the built-in logger. This logs connection inits, disconnects, and incoming data to the console
   */
  public constructor(enableLogger: boolean = false) {
    this._enableLogger = enableLogger;
  }

  private async _send(message: any) {
    for (let i = 1; i <= 3 && this._ws!.readyState !== this._ws!.OPEN; i++) {
      if (i === 1) {
        this._startTime = Date.now();
      }
      this._log("connecting: ", i);
      await this._initClient();
    }
    this._ws!.send(message);
  }

  private _initClient() {
    this._ws = new WebSocket(MsEdgeTTS.SYNTH_URL);

    this._ws.binaryType = "arraybuffer";
    return new Promise((resolve, reject) => {
      this._ws!.onopen = () => {
        this._log(
          "Connected in",
          (Date.now() - this._startTime) / 1000,
          "seconds",
        );
        this._send(
          `Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n
                    {
                        "context": {
                            "synthesis": {
                                "audio": {
                                    "metadataoptions": {
                                        "sentenceBoundaryEnabled": "false",
                                        "wordBoundaryEnabled": "false"
                                    },
                                    "outputFormat": "${this._outputFormat}" 
                                }
                            }
                        }
                    }
                `,
        ).then(resolve);
      };
      this._ws!.onmessage = (m: any) => {
        const buffer = Buffer.from(m.data as ArrayBuffer);
        const message = buffer.toString();
        const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)![1];
        if (message.includes("Path:turn.start")) {
          // start of turn, ignore
        } else if (message.includes("Path:turn.end")) {
          // end of turn, close stream
          this._streams[requestId].push(null);
        } else if (message.includes("Path:response")) {
          // context response, ignore
        } else if (
          message.includes("Path:audio") &&
          m.data instanceof ArrayBuffer
        ) {
          this._pushAudioData(buffer, requestId);
        } else {
          this._log("UNKNOWN MESSAGE", message);
        }
      };
      this._ws!.onclose = () => {
        this._log(
          "disconnected after:",
          (Date.now() - this._startTime) / 1000,
          "seconds",
        );
        for (const requestId in this._streams) {
          this._streams[requestId].push(null);
        }
      };
      this._ws!.onerror = function (error: any) {
        reject("Connect Error: " + error);
      };
    });
  }

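  // Binary frames from the service carry a text header terminated by "Path:audio\r\n"
  // (BINARY_DELIM); everything after that delimiter is raw audio data for the request.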
  private _pushAudioData(audioBuffer: Buffer, requestId: string) {
    const audioStartIndex =
      audioBuffer.indexOf(MsEdgeTTS.BINARY_DELIM) +
      MsEdgeTTS.BINARY_DELIM.length;
    const audioData = audioBuffer.subarray(audioStartIndex);
    this._streams[requestId].push(audioData);
    this._log("received audio chunk, size: ", audioData?.length);
  }

  private _SSMLTemplate(input: string, options: ProsodyOptions = {}): string {
    // in case future updates to the edge API block these elements, we'll be concatenating strings.
    options = { ...new ProsodyOptions(), ...options };
    return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this._voiceLocale}">
                <voice name="${this._voice}">
                    <prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}">
                        ${input}
                    </prosody> 
                </voice>
            </speak>`;
  }
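
  // For illustration, with voice "en-US-AriaNeural" and default ProsodyOptions the
  // template produces SSML of roughly this shape:
  //   <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
  //     <voice name="en-US-AriaNeural">
  //       <prosody pitch="+0Hz" rate="1" volume="100">Hello, world!</prosody>
  //     </voice>
  //   </speak>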

  /**
   * Fetch the list of voices available in Microsoft Edge.
   * This is not the complete list: all voices supported by this module [can be found here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support) (neural, standard, and preview).
   */
  getVoices(): Promise<Voice[]> {
    return fetch(MsEdgeTTS.VOICES_URL).then((response) => {
      if (!response.ok) {
        throw new Error("Network response was not ok");
      }
      return response.json() as Promise<Voice[]>;
    });
  }
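
  // Example (a sketch): collect the ShortName of every en-US voice.
  //   const voices = await tts.getVoices();
  //   const enUS = voices.filter((v) => v.Locale === "en-US").map((v) => v.ShortName);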

  /**
   * Sets the required information for the speech to be synthesised and inits a new WebSocket connection.
   * Must be called at least once before text can be synthesised.
   * The settings are saved on this instance. Can be called at any time to update the metadata.
   *
   * @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices)
   * @param outputFormat any {@link OUTPUT_FORMAT}
   * @param voiceLocale (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName`
   */
  async setMetadata(
    voiceName: string,
    outputFormat: OUTPUT_FORMAT,
    voiceLocale?: string,
  ) {
    const oldVoice = this._voice;
    const oldVoiceLocale = this._voiceLocale;
    const oldOutputFormat = this._outputFormat;

    this._voice = voiceName;
    this._voiceLocale = voiceLocale;
    if (!this._voiceLocale) {
      const voiceLangMatch = MsEdgeTTS.VOICE_LANG_REGEX.exec(this._voice);
      if (!voiceLangMatch)
        throw new Error("Could not infer voiceLocale from voiceName!");
      this._voiceLocale = voiceLangMatch[0];
    }
    this._outputFormat = outputFormat;

    const changed =
      oldVoice !== this._voice ||
      oldVoiceLocale !== this._voiceLocale ||
      oldOutputFormat !== this._outputFormat;

    // create new client
    if (changed || !this._ws || this._ws.readyState !== this._ws.OPEN) {
      this._startTime = Date.now();
      await this._initClient();
    }
  }
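
  // Note: with a voice name like "en-US-AriaNeural" the locale "en-US" is inferred
  // from the name automatically, so voiceLocale can usually be omitted.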

  private _metadataCheck() {
    if (!this._ws)
      throw new Error(
        "Speech synthesis not configured yet. Run setMetadata before calling toStream or toFile.",
      );
  }

  /**
   * Close the WebSocket connection.
   */
  close() {
    this._ws?.close();
  }

  /**
   * Writes raw audio synthesised from text in real-time to a {@link Readable}. Uses a basic {@link _SSMLTemplate SSML template}.
   *
   * @param input the text to synthesise. Can include SSML elements.
   * @param options (optional) {@link ProsodyOptions}
   * @returns {Readable} - a `stream.Readable` with the audio data
   */
  toStream(input: string, options?: ProsodyOptions): Readable {
    const { stream } = this._rawSSMLRequest(this._SSMLTemplate(input, options));
    return stream;
  }
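
  // Example (Node.js sketch; the output path is illustrative):
  //   import { createWriteStream } from "fs";
  //   tts.toStream("Hello there").pipe(createWriteStream("hello.mp3"));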

  /**
   * Synthesises text and resolves with the complete audio as an {@link ArrayBuffer} once the stream ends. Uses a basic {@link _SSMLTemplate SSML template}.
   *
   * @param input the text to synthesise. Can include SSML elements.
   * @param options (optional) {@link ProsodyOptions}
   */
  toArrayBuffer(input: string, options?: ProsodyOptions): Promise<ArrayBuffer> {
    return new Promise((resolve, reject) => {
      const data: Uint8Array[] = [];
      const readable = this.toStream(input, options);
      readable.on("data", (chunk) => {
        data.push(chunk);
      });

      readable.on("end", () => {
        // Copy out exactly the received bytes: Buffer.concat may return a view into
        // a larger pooled buffer, so returning `.buffer` directly could include extra data.
        const merged = Buffer.concat(data);
        resolve(
          merged.buffer.slice(
            merged.byteOffset,
            merged.byteOffset + merged.byteLength,
          ),
        );
      });

      readable.on("error", (err) => {
        reject(err);
      });
    });
  }
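
  // Example (browser sketch; assumes an MP3 output format was selected in setMetadata):
  //   const audio = await tts.toArrayBuffer("Hello, world!");
  //   const url = URL.createObjectURL(new Blob([audio], { type: "audio/mpeg" }));
  //   new Audio(url).play();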

  /**
   * Writes raw audio synthesised from a request in real-time to a {@link Readable}. Has no SSML template. Basic SSML should be provided in the request.
   *
   * @param requestSSML the SSML to send. SSML elements are required for it to work.
   * @returns {Readable} - a `stream.Readable` with the audio data
   */
  rawToStream(requestSSML: string): Readable {
    const { stream } = this._rawSSMLRequest(requestSSML);
    return stream;
  }
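
  // Note (a sketch): rawToStream expects a complete SSML document, e.g. one shaped
  // like the _SSMLTemplate output above, rather than plain text.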

  private _rawSSMLRequest(requestSSML: string): {
    stream: Readable;
    requestId: string;
  } {
    this._metadataCheck();

    const requestId = randomBytes(16).toString("hex");
    const request =
      `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nPath:ssml\r\n\r\n
                ` + requestSSML.trim();
    // https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup
    const self = this;
    const stream = new Readable({
      read() {},
      destroy(error: Error | null, callback: (error: Error | null) => void) {
        delete self._streams[requestId];
        callback(error);
      },
    });
    this._streams[requestId] = stream;
    this._send(request).then();
    return { stream, requestId };
  }
}