CallOverlay.svelte

<script lang="ts">
	import { settings, showCallOverlay } from '$lib/stores';
	import { onMount, tick, getContext } from 'svelte';

	import { blobToFile, calculateSHA256, extractSentences, findWordIndices } from '$lib/utils';
	import { synthesizeOpenAISpeech, transcribeAudio } from '$lib/apis/audio';

	import { toast } from 'svelte-sonner';

	import Tooltip from '$lib/components/common/Tooltip.svelte';

	const i18n = getContext('i18n');

	export let submitPrompt: Function;

	let loading = false;
	let confirmed = false;

	let assistantSpeaking = false;
	let assistantAudio = {};
	let assistantAudioIdx = null;

	let rmsLevel = 0;
	let hasStartedSpeaking = false;

	let audioContext;
	let analyser;
	let dataArray;
	let audioElement;
	let animationFrameId;
	let speechRecognition;
	let currentUtterance = null;

	let mediaRecorder;
	let audioChunks = [];

	// Visualizer bar heights updated by visualize() below
	let div1Height = 0, div2Height = 0, div3Height = 0, div4Height = 0;

	const MIN_DECIBELS = -45;
	const VISUALIZER_BUFFER_LENGTH = 300;

	let visualizerData = Array(VISUALIZER_BUFFER_LENGTH).fill(0);
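
	// Route the assistant <audio> element through an AnalyserNode so its
	// frequency data can drive the visualizer bars.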
	const startAudio = () => {
		audioContext = new (window.AudioContext || window.webkitAudioContext)();
		analyser = audioContext.createAnalyser();

		const source = audioContext.createMediaElementSource(audioElement);
		source.connect(analyser);
		analyser.connect(audioContext.destination);
		analyser.fftSize = 32; // Adjust the fftSize

		dataArray = new Uint8Array(analyser.frequencyBinCount);
		visualize();
	};

	const visualize = () => {
		analyser.getByteFrequencyData(dataArray);

		div1Height = dataArray[1] / 2;
		div2Height = dataArray[3] / 2;
		div3Height = dataArray[5] / 2;
		div4Height = dataArray[7] / 2;

		animationFrameId = requestAnimationFrame(visualize);
	};

	// Function to calculate the RMS level from time domain data
	const calculateRMS = (data: Uint8Array) => {
		let sumSquares = 0;
		for (let i = 0; i < data.length; i++) {
			const normalizedValue = (data[i] - 128) / 128; // Normalize the data
			sumSquares += normalizedValue * normalizedValue;
		}
		return Math.sqrt(sumSquares / data.length);
	};

	const normalizeRMS = (rms) => {
		rms = rms * 10;
		const exp = 1.5; // Adjust exponent value; values greater than 1 expand larger numbers more and compress smaller numbers more
		const scaledRMS = Math.pow(rms, exp);

		// Scale between 0.01 (1%) and 1.0 (100%)
		return Math.min(1.0, Math.max(0.01, scaledRMS));
	};
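
	// Poll the mic stream every animation frame: update rmsLevel for the UI,
	// interrupt any assistant audio as soon as the user makes a sound, and
	// stop the recorder after ~2 s of silence once speech has started.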
	const analyseAudio = (stream) => {
		const audioContext = new AudioContext();
		const audioStreamSource = audioContext.createMediaStreamSource(stream);

		const analyser = audioContext.createAnalyser();
		analyser.minDecibels = MIN_DECIBELS;
		audioStreamSource.connect(analyser);

		const bufferLength = analyser.frequencyBinCount;

		const domainData = new Uint8Array(bufferLength);
		const timeDomainData = new Uint8Array(analyser.fftSize);

		let lastSoundTime = Date.now();
		hasStartedSpeaking = false;

		const detectSound = () => {
			const processFrame = () => {
				if (!mediaRecorder || !$showCallOverlay) {
					if (mediaRecorder) {
						mediaRecorder.stop();
					}
					return;
				}

				analyser.getByteTimeDomainData(timeDomainData);
				analyser.getByteFrequencyData(domainData);

				// Calculate RMS level from time domain data
				rmsLevel = calculateRMS(timeDomainData);

				// Check if initial speech/noise has started
				const hasSound = domainData.some((value) => value > 0);
				if (hasSound) {
					stopAllAudio();
					hasStartedSpeaking = true;
					lastSoundTime = Date.now();
				}

				// Start silence detection only after initial speech/noise has been detected
				if (hasStartedSpeaking) {
					if (Date.now() - lastSoundTime > 2000) {
						confirmed = true;

						if (mediaRecorder) {
							mediaRecorder.stop();
						}
					}
				}

				window.requestAnimationFrame(processFrame);
			};

			window.requestAnimationFrame(processFrame);
		};

		detectSound();
	};
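
	// Stop whatever the assistant is currently saying: cancel any in-flight
	// browser speech synthesis and pause/rewind the currently playing clip.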
	const stopAllAudio = () => {
		if (currentUtterance) {
			speechSynthesis.cancel();
			currentUtterance = null;
		}

		if (assistantAudio[assistantAudioIdx]) {
			assistantAudio[assistantAudioIdx].pause();
			assistantAudio[assistantAudioIdx].currentTime = 0;
		}

		assistantSpeaking = false;
	};
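
	// Play the synthesized clip at the given index and resolve once it ends;
	// the last clip in assistantAudio clears the speaking flag.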
	const playAudio = (idx) => {
		return new Promise((res) => {
			assistantAudioIdx = idx;

			const audio = assistantAudio[idx];
			audio.play();
			audio.onended = async (e) => {
				await new Promise((r) => setTimeout(r, 300));

				if (Object.keys(assistantAudio).length - 1 === idx) {
					assistantSpeaking = false;
				}

				res(e);
			};
		});
	};
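
	// Fetch a single OpenAI TTS clip for the given text (currently not called
	// from this component; the handlers below synthesize sentence by sentence).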
	const getOpenAISpeech = async (text) => {
		const res = await synthesizeOpenAISpeech(
			localStorage.token,
			$settings?.audio?.speaker ?? 'alloy',
			text,
			$settings?.audio?.model ?? 'tts-1'
		).catch((error) => {
			toast.error(error);
			assistantSpeaking = false;
			return null;
		});

		if (res) {
			const blob = await res.blob();
			const blobUrl = URL.createObjectURL(blob);
			const audio = new Audio(blobUrl);
			assistantAudio = audio;
		}
	};
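
	// Send the recorded audio to the transcription endpoint, then submit the
	// transcript as a prompt and speak the first response.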
	const transcribeHandler = async (audioBlob) => {
		// Wrap the recorded blob in a File and send it for transcription
		await tick();
		const file = blobToFile(audioBlob, 'recording.wav');

		const res = await transcribeAudio(localStorage.token, file).catch((error) => {
			toast.error(error);
			return null;
		});

		if (res) {
			console.log(res.text);

			if (res.text !== '') {
				const _responses = await submitPrompt(res.text);
				console.log(_responses);

				if (_responses.at(0)) {
					const content = _responses[0];
					if (content) {
						assistantSpeakingHandler(content);
					}
				}
			}
		}
	};
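
	// Speak the assistant's reply: use the browser's speechSynthesis by default,
	// or, with the OpenAI engine, split the reply into sentences (merging very
	// short ones), synthesize each clip, and chain playback in order.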
	const assistantSpeakingHandler = async (content) => {
		assistantSpeaking = true;

		if (($settings?.audio?.TTSEngine ?? '') == '') {
			currentUtterance = new SpeechSynthesisUtterance(content);
			speechSynthesis.speak(currentUtterance);
		} else if ($settings?.audio?.TTSEngine === 'openai') {
			console.log('openai');

			const sentences = extractSentences(content).reduce((mergedTexts, currentText) => {
				const lastIndex = mergedTexts.length - 1;
				if (lastIndex >= 0) {
					const previousText = mergedTexts[lastIndex];
					const wordCount = previousText.split(/\s+/).length;
					if (wordCount < 2) {
						mergedTexts[lastIndex] = previousText + ' ' + currentText;
					} else {
						mergedTexts.push(currentText);
					}
				} else {
					mergedTexts.push(currentText);
				}
				return mergedTexts;
			}, []);

			console.log(sentences);

			let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately

			for (const [idx, sentence] of sentences.entries()) {
				const res = await synthesizeOpenAISpeech(
					localStorage.token,
					$settings?.audio?.speaker,
					sentence,
					$settings?.audio?.model
				).catch((error) => {
					toast.error(error);
					assistantSpeaking = false;
					return null;
				});

				if (res) {
					const blob = await res.blob();
					const blobUrl = URL.createObjectURL(blob);
					const audio = new Audio(blobUrl);
					assistantAudio[idx] = audio;
					lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
				}
			}
		}
	};
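
	// MediaRecorder onstop handler: if the take was silence-confirmed, transcribe it,
	// then restart recording while the overlay stays open; otherwise just tear down.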
	const stopRecordingCallback = async () => {
		if ($showCallOverlay) {
			if (confirmed) {
				loading = true;

				const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
				await transcribeHandler(audioBlob);

				confirmed = false;
				loading = false;
			}
			audioChunks = [];
			mediaRecorder = false;
			startRecording();
		} else {
			audioChunks = [];
			mediaRecorder = false;
		}
	};
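
	// Request mic access, start a MediaRecorder, and begin analysing the stream;
	// audio chunks are only kept once speech has been detected.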
	const startRecording = async () => {
		const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
		mediaRecorder = new MediaRecorder(stream);
		mediaRecorder.onstart = () => {
			console.log('Recording started');
			audioChunks = [];
			analyseAudio(stream);
		};
		mediaRecorder.ondataavailable = (event) => {
			if (hasStartedSpeaking) {
				audioChunks.push(event.data);
			}
		};
		mediaRecorder.onstop = async () => {
			console.log('Recording stopped');
			await stopRecordingCallback();
		};
		mediaRecorder.start();
	};
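
	// Start listening as soon as the call overlay is shown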
	$: if ($showCallOverlay) {
		startRecording();
	}
</script>

{#if $showCallOverlay}
	<div class=" absolute w-full h-full flex z-[999]">
		<div
			class="absolute w-full h-full bg-white text-gray-700 dark:bg-black dark:text-gray-300 flex justify-center"
		>
			<div class="max-w-lg w-full h-screen flex flex-col justify-between p-6">
				<div>
					<!-- navbar -->
				</div>
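
				<!-- Center visualizer: loading spinner while transcribing, pulsing circle scaled by rmsLevel while listening -->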
  267. <div class="flex justify-center items-center w-ull">
  268. {#if loading}
  269. <svg
  270. class="size-44 text-gray-900 dark:text-gray-400"
  271. viewBox="0 0 24 24"
  272. fill="currentColor"
  273. xmlns="http://www.w3.org/2000/svg"
  274. ><style>
  275. .spinner_qM83 {
  276. animation: spinner_8HQG 1.05s infinite;
  277. }
  278. .spinner_oXPr {
  279. animation-delay: 0.1s;
  280. }
  281. .spinner_ZTLf {
  282. animation-delay: 0.2s;
  283. }
  284. @keyframes spinner_8HQG {
  285. 0%,
  286. 57.14% {
  287. animation-timing-function: cubic-bezier(0.33, 0.66, 0.66, 1);
  288. transform: translate(0);
  289. }
  290. 28.57% {
  291. animation-timing-function: cubic-bezier(0.33, 0, 0.66, 0.33);
  292. transform: translateY(-6px);
  293. }
  294. 100% {
  295. transform: translate(0);
  296. }
  297. }
  298. </style><circle class="spinner_qM83" cx="4" cy="12" r="3" /><circle
  299. class="spinner_qM83 spinner_oXPr"
  300. cx="12"
  301. cy="12"
  302. r="3"
  303. /><circle class="spinner_qM83 spinner_ZTLf" cx="20" cy="12" r="3" /></svg
  304. >
  305. {:else}
  306. <div
  307. class=" {rmsLevel * 100 > 4
  308. ? ' size-52'
  309. : rmsLevel * 100 > 2
  310. ? 'size-48'
  311. : rmsLevel * 100 > 1
  312. ? 'size-[11.5rem]'
  313. : 'size-44'} transition-all bg-black dark:bg-white rounded-full"
  314. />
  315. {/if}
  316. </div>
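
				<!-- Bottom controls: camera button (marked WIP), status label, and end-call button -->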
  317. <div class="flex justify-between items-center pb-2 w-full">
  318. <div>
  319. <Tooltip content="WIP 🚧">
  320. <button class=" p-3 rounded-full bg-gray-50 dark:bg-gray-900">
  321. <svg
  322. xmlns="http://www.w3.org/2000/svg"
  323. fill="none"
  324. viewBox="0 0 24 24"
  325. stroke-width="1.5"
  326. stroke="currentColor"
  327. class="size-5"
  328. >
  329. <path
  330. stroke-linecap="round"
  331. stroke-linejoin="round"
  332. d="M6.827 6.175A2.31 2.31 0 0 1 5.186 7.23c-.38.054-.757.112-1.134.175C2.999 7.58 2.25 8.507 2.25 9.574V18a2.25 2.25 0 0 0 2.25 2.25h15A2.25 2.25 0 0 0 21.75 18V9.574c0-1.067-.75-1.994-1.802-2.169a47.865 47.865 0 0 0-1.134-.175 2.31 2.31 0 0 1-1.64-1.055l-.822-1.316a2.192 2.192 0 0 0-1.736-1.039 48.774 48.774 0 0 0-5.232 0 2.192 2.192 0 0 0-1.736 1.039l-.821 1.316Z"
  333. />
  334. <path
  335. stroke-linecap="round"
  336. stroke-linejoin="round"
  337. d="M16.5 12.75a4.5 4.5 0 1 1-9 0 4.5 4.5 0 0 1 9 0ZM18.75 10.5h.008v.008h-.008V10.5Z"
  338. />
  339. </svg>
  340. </button>
  341. </Tooltip>
  342. </div>
  343. <div>
  344. <button type="button">
  345. <div class=" line-clamp-1 text-sm font-medium">
  346. {#if loading}
  347. Thinking...
  348. {:else}
  349. Listening...
  350. {/if}
  351. </div>
  352. </button>
  353. </div>
  354. <div>
  355. <button
  356. class=" p-3 rounded-full bg-gray-50 dark:bg-gray-900"
  357. on:click={async () => {
  358. showCallOverlay.set(false);
  359. }}
  360. type="button"
  361. >
  362. <svg
  363. xmlns="http://www.w3.org/2000/svg"
  364. viewBox="0 0 20 20"
  365. fill="currentColor"
  366. class="size-5"
  367. >
  368. <path
  369. d="M6.28 5.22a.75.75 0 0 0-1.06 1.06L8.94 10l-3.72 3.72a.75.75 0 1 0 1.06 1.06L10 11.06l3.72 3.72a.75.75 0 1 0 1.06-1.06L11.06 10l3.72-3.72a.75.75 0 0 0-1.06-1.06L10 8.94 6.28 5.22Z"
  370. />
  371. </svg>
  372. </button>
  373. </div>
  374. </div>
  375. </div>
  376. </div>
  377. </div>
  378. {/if}