<!-- Audio.svelte: audio (speech-to-text / text-to-speech) settings tab -->
  1. <script lang="ts">
  2. import { toast } from 'svelte-sonner';
  3. import { createEventDispatcher, onMount, getContext } from 'svelte';
  4. import { user, settings, config } from '$lib/stores';
  5. import { getVoices as _getVoices } from '$lib/apis/audio';
  6. import Switch from '$lib/components/common/Switch.svelte';
  7. import Spinner from '$lib/components/common/Spinner.svelte';
  8. import Tooltip from '$lib/components/common/Tooltip.svelte';
  // --- Props ---------------------------------------------------------------
  // Callback supplied by the parent; called with a partial settings object.
  export let saveSettings: Function;

  // --- Svelte plumbing -----------------------------------------------------
  const i18n = getContext('i18n');
  const dispatch = createEventDispatcher();

  // --- Behaviour toggles ---------------------------------------------------
  let conversationMode = false;
  let speechAutoSend = false;
  let responseAutoPlayback = false;

  // --- Speech-to-text ------------------------------------------------------
  let STTEngine = '';
  let STTLanguage = '';

  // --- Text-to-speech ------------------------------------------------------
  let TTSEngine = '';
  let TTSEngineConfig = {};
  // Kokoro.js model instance, its latest progress event, and a loading flag.
  let TTSModel = null;
  let TTSModelProgress = null;
  let TTSModelLoading = false;

  // --- Voice selection -----------------------------------------------------
  let voice = '';
  let voices = [];
  // When false, the browser voice list is limited to `localService` voices.
  let nonLocalVoices = false;

  // --- Playback speed multiplier ------------------------------------------
  let playbackRate = 1;
  /**
   * Refreshes `voices` for the TTS engine currently in effect.
   *
   * - `browser-kokoro`: lists the voices exposed by the loaded Kokoro.js model,
   *   loading the model first when necessary.
   * - Server engine is `''`: polls the browser's `speechSynthesis` voice list,
   *   which can be empty right after page load.
   * - Any other server engine: fetches the voice list from the backend.
   *
   * Never rejects for a failed backend request; the error is shown as a toast
   * and `voices` is left unchanged.
   */
  const getVoices = async () => {
    if (TTSEngine === 'browser-kokoro') {
      if (!TTSModel) {
        await loadKokoro();
      }

      // loadKokoro() leaves TTSModel unset when no dtype has been selected yet,
      // so there is nothing to enumerate.
      if (!TTSModel) {
        return;
      }

      voices = Object.entries(TTSModel.voices).map(([key, value]) => ({
        id: key,
        name: value.name,
        localService: false
      }));
      return;
    }

    if ($config.audio.tts.engine === '') {
      // Without the Web Speech API there are no voices to wait for.
      if (typeof speechSynthesis === 'undefined') {
        return;
      }

      // Poll every 100 ms, but give up after ~5 s so the timer cannot run
      // forever on systems that never report any voice.
      const POLL_INTERVAL_MS = 100;
      const MAX_ATTEMPTS = 50;
      let attempts = 0;

      const getVoicesLoop = setInterval(() => {
        attempts += 1;
        voices = speechSynthesis.getVoices();

        if (voices.length > 0 || attempts >= MAX_ATTEMPTS) {
          clearInterval(getVoicesLoop);
        }
      }, POLL_INTERVAL_MS);
      return;
    }

    try {
      const res = await _getVoices(localStorage.token);
      if (res) {
        // Fall back to an empty list so the template can always iterate.
        voices = res.voices ?? [];
      }
    } catch (e) {
      toast.error(`${e}`);
    }
  };
  /** Inverts `responseAutoPlayback` and passes the new value to `saveSettings`. */
  const toggleResponseAutoPlayback = async () => {
    const enabled = !responseAutoPlayback;
    responseAutoPlayback = enabled;
    saveSettings({ responseAutoPlayback: enabled });
  };
  /** Inverts `speechAutoSend` and passes the new value to `saveSettings`. */
  const toggleSpeechAutoSend = async () => {
    const enabled = !speechAutoSend;
    speechAutoSend = enabled;
    saveSettings({ speechAutoSend: enabled });
  };
  // Seed the form state from the stored user settings, resolve the voice to
  // preselect, then load the list of available voices.
  onMount(async () => {
    const storedTTS = $settings.audio?.tts;
    const storedSTT = $settings.audio?.stt;

    playbackRate = storedTTS?.playbackRate ?? 1;
    conversationMode = $settings.conversationMode ?? false;
    speechAutoSend = $settings.speechAutoSend ?? false;
    responseAutoPlayback = $settings.responseAutoPlayback ?? false;

    STTEngine = storedSTT?.engine ?? '';
    STTLanguage = storedSTT?.language ?? '';
    TTSEngine = storedTTS?.engine ?? '';
    TTSEngineConfig = storedTTS?.engineConfig ?? {};

    // The user's own voice choice is only honoured while the server default
    // recorded alongside it still matches the current server default.
    const serverVoice = $config.audio.tts.voice;
    const serverDefaultUnchanged = storedTTS?.defaultVoice === serverVoice;
    voice = serverDefaultUnchanged
      ? (storedTTS?.voice ?? serverVoice ?? '')
      : (serverVoice ?? '');

    nonLocalVoices = storedTTS?.nonLocalVoices ?? false;

    await getVoices();
  });
  // Reactive statement: re-evaluated by Svelte when TTSEngine or
  // TTSEngineConfig is assigned; does nothing while either is falsy.
  $: if (TTSEngine && TTSEngineConfig) onTTSEngineChange();
  /** Loads the Kokoro.js model when the selected engine is `browser-kokoro`. */
  const onTTSEngineChange = async () => {
    if (TTSEngine !== 'browser-kokoro') {
      return;
    }

    await loadKokoro();
  };
  /**
   * Loads the Kokoro.js TTS model in the browser and refreshes the voice list.
   *
   * Does nothing unless the selected engine is `browser-kokoro`. The voice list
   * is cleared first; the model itself is only loaded once a dtype has been
   * chosen in `TTSEngineConfig`.
   *
   * A failed import or model download is reported with a toast instead of
   * escaping as an unhandled rejection; `TTSModel` then stays `null`.
   * `TTSModelLoading` is always reset once loading has finished or failed.
   */
  const loadKokoro = async () => {
    if (TTSEngine !== 'browser-kokoro') {
      return;
    }

    voices = [];

    const dtype = TTSEngineConfig?.dtype;
    if (!dtype) {
      return;
    }

    TTSModel = null;
    TTSModelProgress = null;
    TTSModelLoading = true;

    try {
      const model_id = 'onnx-community/Kokoro-82M-v1.0-ONNX';
      // Imported lazily so the library is only fetched when this engine is used.
      const { KokoroTTS } = await import('kokoro-js');

      TTSModel = await KokoroTTS.from_pretrained(model_id, {
        dtype, // Options: "fp32", "fp16", "q8", "q4", "q4f16"
        device: navigator?.gpu ? 'webgpu' : 'wasm', // Use WebGPU when the browser exposes it
        progress_callback: (e) => {
          TTSModelProgress = e;
        }
      });
    } catch (e) {
      console.error(e);
      toast.error(`${e}`);
      return;
    } finally {
      TTSModelLoading = false;
    }

    await getVoices();
  };
  121. </script>
  <form
    id="tab-audio"
    class="flex flex-col h-full justify-between space-y-3 text-sm"
    on:submit|preventDefault={async () => {
      // Empty strings mean "not set" and are passed on as undefined.
      const stt = {
        engine: STTEngine === '' ? undefined : STTEngine,
        language: STTLanguage === '' ? undefined : STTLanguage
      };

      const tts = {
        engine: TTSEngine === '' ? undefined : TTSEngine,
        engineConfig: TTSEngineConfig,
        playbackRate: playbackRate,
        voice: voice === '' ? undefined : voice,
        // Server default voice at save time, stored next to the user's choice.
        defaultVoice: $config?.audio?.tts?.voice ?? '',
        // Only included when the server TTS engine is the empty string.
        nonLocalVoices: $config.audio.tts.engine === '' ? nonLocalVoices : undefined
      };

      saveSettings({ audio: { stt, tts } });
      dispatch('save');
    }}
  >
  145. <div class=" space-y-3 overflow-y-scroll max-h-[28rem] lg:max-h-full">
  <!-- Speech-to-text settings -->
  <div>
    <div class=" mb-1 text-sm font-medium">{$i18n.t('STT Settings')}</div>
    <!-- Engine and language pickers are hidden when the server STT engine is 'web'. -->
    {#if $config.audio.stt.engine !== 'web'}
      <div class=" py-0.5 flex w-full justify-between">
        <div class=" self-center text-xs font-medium">{$i18n.t('Speech-to-Text Engine')}</div>
        <div class="flex items-center relative">
          <select
            class="dark:bg-gray-900 w-fit pr-8 rounded-sm px-2 p-1 text-xs bg-transparent outline-hidden text-right"
            bind:value={STTEngine}
            placeholder={$i18n.t('Select an engine')}
          >
            <option value="">{$i18n.t('Default')}</option>
            <option value="web">{$i18n.t('Web API')}</option>
          </select>
        </div>
      </div>
      <div class=" py-0.5 flex w-full justify-between">
        <div class=" self-center text-xs font-medium">{$i18n.t('Language')}</div>
        <div class="flex items-center relative text-xs px-3">
          <Tooltip
            content={$i18n.t(
              'The language of the input audio. Supplying the input language in ISO-639-1 (e.g. en) format will improve accuracy and latency. Leave blank to automatically detect the language.'
            )}
            placement="top"
          >
            <input
              type="text"
              bind:value={STTLanguage}
              placeholder={$i18n.t('e.g. en')}
              class=" text-sm text-right bg-transparent dark:text-gray-300 outline-hidden"
            />
          </Tooltip>
        </div>
      </div>
    {/if}
    <!-- On/Off button: each click calls toggleSpeechAutoSend(), outside the form submit. -->
    <div class=" py-0.5 flex w-full justify-between">
      <div class=" self-center text-xs font-medium">
        {$i18n.t('Instant Auto-Send After Voice Transcription')}
      </div>
      <button
        class="p-1 px-3 text-xs flex rounded-sm transition"
        on:click={() => {
          toggleSpeechAutoSend();
        }}
        type="button"
      >
        {#if speechAutoSend === true}
          <span class="ml-2 self-center">{$i18n.t('On')}</span>
        {:else}
          <span class="ml-2 self-center">{$i18n.t('Off')}</span>
        {/if}
      </button>
    </div>
  </div>
  <!-- Text-to-speech settings -->
  <div>
    <div class=" mb-1 text-sm font-medium">{$i18n.t('TTS Settings')}</div>
    <div class=" py-0.5 flex w-full justify-between">
      <div class=" self-center text-xs font-medium">{$i18n.t('Text-to-Speech Engine')}</div>
      <div class="flex items-center relative">
        <select
          class="dark:bg-gray-900 w-fit pr-8 rounded-sm px-2 p-1 text-xs bg-transparent outline-hidden text-right"
          bind:value={TTSEngine}
          placeholder={$i18n.t('Select an engine')}
        >
          <option value="">{$i18n.t('Default')}</option>
          <option value="browser-kokoro">{$i18n.t('Kokoro.js (Browser)')}</option>
        </select>
      </div>
    </div>
    <!-- The dtype picker is only shown for the Kokoro.js engine; loadKokoro() needs a dtype before it loads the model. -->
    {#if TTSEngine === 'browser-kokoro'}
      <div class=" py-0.5 flex w-full justify-between">
        <div class=" self-center text-xs font-medium">{$i18n.t('Kokoro.js Dtype')}</div>
        <div class="flex items-center relative">
          <select
            class="dark:bg-gray-900 w-fit pr-8 rounded-sm px-2 p-1 text-xs bg-transparent outline-hidden text-right"
            bind:value={TTSEngineConfig.dtype}
            placeholder={$i18n.t('Select dtype')}
          >
            <option value="" disabled selected>{$i18n.t('Select dtype')}</option>
            <option value="fp32">fp32</option>
            <option value="fp16">fp16</option>
            <option value="q8">q8</option>
            <option value="q4">q4</option>
          </select>
        </div>
      </div>
    {/if}
    <!-- On/Off button: each click calls toggleResponseAutoPlayback(), outside the form submit. -->
    <div class=" py-0.5 flex w-full justify-between">
      <div class=" self-center text-xs font-medium">{$i18n.t('Auto-playback response')}</div>
      <button
        class="p-1 px-3 text-xs flex rounded-sm transition"
        on:click={() => {
          toggleResponseAutoPlayback();
        }}
        type="button"
      >
        {#if responseAutoPlayback === true}
          <span class="ml-2 self-center">{$i18n.t('On')}</span>
        {:else}
          <span class="ml-2 self-center">{$i18n.t('Off')}</span>
        {/if}
      </button>
    </div>
    <!-- Playback speed multiplier; the trailing "x" is a literal unit label. -->
    <div class=" py-0.5 flex w-full justify-between">
      <div class=" self-center text-xs font-medium">{$i18n.t('Speech Playback Speed')}</div>
      <div class="flex items-center relative text-xs px-3">
        <input
          type="number"
          min="0"
          step="0.01"
          bind:value={playbackRate}
          class=" text-sm text-right bg-transparent dark:text-gray-300 outline-hidden"
        />
        x
      </div>
    </div>
  </div>
  263. <hr class=" border-gray-100 dark:border-gray-850" />
  <!-- Voice picker: the variant shown depends on the selected / server TTS engine. -->
  {#if TTSEngine === 'browser-kokoro'}
    {#if TTSModel}
      <!-- Kokoro.js model is loaded: free-text input with voice suggestions. -->
      <div>
        <div class=" mb-2.5 text-sm font-medium">{$i18n.t('Set Voice')}</div>
        <div class="flex w-full">
          <div class="flex-1">
            <input
              list="voice-list"
              class="w-full text-sm bg-transparent dark:text-gray-300 outline-hidden"
              bind:value={voice}
              placeholder={$i18n.t('Select a voice')}
            />
            <datalist id="voice-list">
              {#each voices as _voice}
                <option value={_voice.id}>{_voice.name}</option>
              {/each}
            </datalist>
          </div>
        </div>
      </div>
    {:else}
      <!-- Kokoro.js model not available yet: spinner plus download progress. -->
      <div>
        <div class=" mb-2.5 text-sm font-medium flex gap-2 items-center">
          <Spinner className="size-4" />
          <div class=" text-sm font-medium shimmer">
            {$i18n.t('Loading Kokoro.js...')}
            {TTSModelProgress && TTSModelProgress.status === 'progress'
              ? `(${Math.round(TTSModelProgress.progress * 10) / 10}%)`
              : ''}
          </div>
        </div>
        <div class="text-xs text-gray-500">
          {$i18n.t('Please do not close the settings page while loading the model.')}
        </div>
      </div>
    {/if}
  {:else if $config.audio.tts.engine === ''}
    <!-- Browser speech synthesis: dropdown, optionally including non-local voices. -->
    <div>
      <div class=" mb-2.5 text-sm font-medium">{$i18n.t('Set Voice')}</div>
      <div class="flex w-full">
        <div class="flex-1">
          <select
            class="w-full text-sm bg-transparent dark:text-gray-300 outline-hidden"
            bind:value={voice}
          >
            <option value="" selected={voice !== ''}>{$i18n.t('Default')}</option>
            {#each voices.filter((v) => nonLocalVoices || v.localService === true) as _voice}
              <option
                value={_voice.name}
                class="bg-gray-100 dark:bg-gray-700"
                selected={voice === _voice.name}>{_voice.name}</option
              >
            {/each}
          </select>
        </div>
      </div>
      <div class="flex items-center justify-between my-1.5">
        <div class="text-xs">
          {$i18n.t('Allow non-local voices')}
        </div>
        <div class="mt-1">
          <Switch bind:state={nonLocalVoices} />
        </div>
      </div>
    </div>
  {:else}
    <!-- Any other server engine: free-text input with voice suggestions. -->
    <div>
      <div class=" mb-2.5 text-sm font-medium">{$i18n.t('Set Voice')}</div>
      <div class="flex w-full">
        <div class="flex-1">
          <input
            list="voice-list"
            class="w-full text-sm bg-transparent dark:text-gray-300 outline-hidden"
            bind:value={voice}
            placeholder={$i18n.t('Select a voice')}
          />
          <datalist id="voice-list">
            {#each voices as _voice}
              <option value={_voice.id}>{_voice.name}</option>
            {/each}
          </datalist>
        </div>
      </div>
    </div>
  {/if}
  349. </div>
  <!-- Footer: the submit button triggers the enclosing form's on:submit handler. -->
  <div class="flex justify-end text-sm font-medium">
    <button
      class="px-3.5 py-1.5 text-sm font-medium bg-black hover:bg-gray-900 text-white dark:bg-white dark:text-black dark:hover:bg-gray-100 transition rounded-full"
      type="submit"
    >
      {$i18n.t('Save')}
    </button>
  </div>
  358. </form>