CallOverlay.svelte

<script lang="ts">
  import { settings, showCallOverlay } from '$lib/stores';
  import { tick, getContext } from 'svelte';

  import { blobToFile, extractSentences } from '$lib/utils';
  import { synthesizeOpenAISpeech, transcribeAudio } from '$lib/apis/audio';

  import { toast } from 'svelte-sonner';
  import Tooltip from '$lib/components/common/Tooltip.svelte';

  const i18n = getContext('i18n');

  export let submitPrompt: Function;
  export let files;
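
  // Call/recording state: `confirmed` marks that a silence gap ended the
  // user's turn, `rmsLevel` drives the pulsing circle in the markup, and
  // `assistantAudio` holds one Audio element per synthesized sentence.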
  let loading = false;
  let confirmed = false;
  let camera = false;
  let cameraStream = null;

  let assistantSpeaking = false;
  let assistantAudio = {};
  let assistantAudioIdx = null;

  let rmsLevel = 0;
  let hasStartedSpeaking = false;

  let audioContext;
  let analyser;
  let dataArray;
  let audioElement;
  let animationFrameId;
  let speechRecognition;
  let currentUtterance = null;

  let mediaRecorder;
  let audioChunks = [];

  // Visualizer bar heights written by visualize()
  let div1Height = 0;
  let div2Height = 0;
  let div3Height = 0;
  let div4Height = 0;

  const MIN_DECIBELS = -45;
  const VISUALIZER_BUFFER_LENGTH = 300;

  let visualizerData = Array(VISUALIZER_BUFFER_LENGTH).fill(0);
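
  // Routes the hidden #audioElement through an AnalyserNode so assistant
  // playback can be visualized, then starts the animation loop.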
  const startAudio = () => {
    audioContext = new (window.AudioContext || (window as any).webkitAudioContext)();
    analyser = audioContext.createAnalyser();

    const source = audioContext.createMediaElementSource(audioElement);
    source.connect(analyser);
    analyser.connect(audioContext.destination);

    analyser.fftSize = 32; // Small FFT size: only a handful of frequency bins are needed
    dataArray = new Uint8Array(analyser.frequencyBinCount);
    visualize();
  };

  const visualize = () => {
    analyser.getByteFrequencyData(dataArray);

    // Sample a few bins to drive the visualizer bars
    div1Height = dataArray[1] / 2;
    div2Height = dataArray[3] / 2;
    div3Height = dataArray[5] / 2;
    div4Height = dataArray[7] / 2;

    animationFrameId = requestAnimationFrame(visualize);
  };

  // Calculate the RMS level from time domain data
  const calculateRMS = (data: Uint8Array) => {
    let sumSquares = 0;
    for (let i = 0; i < data.length; i++) {
      const normalizedValue = (data[i] - 128) / 128; // Map 0..255 samples onto -1..1
      sumSquares += normalizedValue * normalizedValue;
    }
    return Math.sqrt(sumSquares / data.length);
  };
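
  // Maps raw RMS (x10, then ^1.5) onto a 1%..100% range, emphasizing louder
  // input; e.g. rms = 0.05 -> (0.5)^1.5 ≈ 0.35.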
  const normalizeRMS = (rms) => {
    rms = rms * 10;
    const exp = 1.5; // Adjust exponent value; values greater than 1 expand larger numbers more and compress smaller numbers more
    const scaledRMS = Math.pow(rms, exp);

    // Scale between 0.01 (1%) and 1.0 (100%)
    return Math.min(1.0, Math.max(0.01, scaledRMS));
  };
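
  // Voice-activity detection: samples the mic stream every animation frame,
  // interrupts assistant playback as soon as sound is detected, and treats
  // 2 seconds of silence after speech as the end of the user's turn.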
  const analyseAudio = (stream) => {
    const audioContext = new AudioContext();
    const audioStreamSource = audioContext.createMediaStreamSource(stream);

    const analyser = audioContext.createAnalyser();
    analyser.minDecibels = MIN_DECIBELS;
    audioStreamSource.connect(analyser);

    const bufferLength = analyser.frequencyBinCount;
    const domainData = new Uint8Array(bufferLength);
    const timeDomainData = new Uint8Array(analyser.fftSize);

    let lastSoundTime = Date.now();
    hasStartedSpeaking = false;

    const detectSound = () => {
      const processFrame = () => {
        if (!mediaRecorder || !$showCallOverlay) {
          if (mediaRecorder) {
            mediaRecorder.stop();
          }
          return;
        }

        analyser.getByteTimeDomainData(timeDomainData);
        analyser.getByteFrequencyData(domainData);

        // Calculate RMS level from time domain data
        rmsLevel = calculateRMS(timeDomainData);

        // Check if initial speech/noise has started
        const hasSound = domainData.some((value) => value > 0);
        if (hasSound) {
          stopAllAudio();
          hasStartedSpeaking = true;
          lastSoundTime = Date.now();
        }

        // Start silence detection only after initial speech/noise has been detected
        if (hasStartedSpeaking) {
          if (Date.now() - lastSoundTime > 2000) {
            confirmed = true;

            if (mediaRecorder) {
              mediaRecorder.stop();
            }
          }
        }

        window.requestAnimationFrame(processFrame);
      };

      window.requestAnimationFrame(processFrame);
    };

    detectSound();
  };
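
  // Barge-in support: silences both browser speech synthesis and any playing
  // sentence audio the moment the user starts talking.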
  const stopAllAudio = () => {
    if (currentUtterance) {
      speechSynthesis.cancel();
      currentUtterance = null;
    }

    if (assistantAudio[assistantAudioIdx]) {
      assistantAudio[assistantAudioIdx].pause();
      assistantAudio[assistantAudioIdx].currentTime = 0;
    }

    const audioElement = document.getElementById('audioElement') as HTMLAudioElement;
    if (audioElement) {
      audioElement.pause();
      audioElement.currentTime = 0;
    }

    assistantSpeaking = false;
  };
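
  // Plays the idx-th synthesized sentence through the shared #audioElement and
  // resolves once playback ends, so sentences can be chained in order.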
  const playAudio = (idx) => {
    return new Promise((res) => {
      assistantAudioIdx = idx;
      const audioElement = document.getElementById('audioElement') as HTMLAudioElement;
      const audio = assistantAudio[idx];

      audioElement.src = audio.src; // Each entry in `assistantAudio` is an Audio element with a blob `src`
      audioElement.play();

      audioElement.onended = async (e) => {
        await new Promise((r) => setTimeout(r, 300));

        if (Object.keys(assistantAudio).length - 1 === idx) {
          assistantSpeaking = false;
        }

        res(e);
      };
    });
  };

  const getOpenAISpeech = async (text) => {
    const res = await synthesizeOpenAISpeech(
      localStorage.token,
      $settings?.audio?.speaker ?? 'alloy',
      text,
      $settings?.audio?.model ?? 'tts-1'
    ).catch((error) => {
      toast.error(error);
      assistantSpeaking = false;
      return null;
    });

    if (res) {
      const blob = await res.blob();
      const blobUrl = URL.createObjectURL(blob);
      const audio = new Audio(blobUrl);
      assistantAudio = audio;
    }
  };
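
  // Sends the recorded turn to the transcription endpoint, forwards the
  // transcript to submitPrompt, and speaks the first response that comes back.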
  const transcribeHandler = async (audioBlob) => {
    // Wrap the recorded audio into a file and transcribe it
    await tick();
    const file = blobToFile(audioBlob, 'recording.wav');

    const res = await transcribeAudio(localStorage.token, file).catch((error) => {
      toast.error(error);
      return null;
    });

    if (res) {
      console.log(res.text);

      if (res.text !== '') {
        const _responses = await submitPrompt(res.text);
        console.log(_responses);

        if (_responses.at(0)) {
          const content = _responses[0];
          if (content) {
            assistantSpeakingHandler(content);
          }
        }
      }
    }
  };
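
  // Speaks the assistant's reply: the browser's SpeechSynthesis API when no
  // TTS engine is configured, otherwise OpenAI TTS sentence by sentence,
  // overlapping synthesis of the next sentence with playback of the current one.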
  const assistantSpeakingHandler = async (content) => {
    assistantSpeaking = true;

    if (($settings?.audio?.TTSEngine ?? '') == '') {
      currentUtterance = new SpeechSynthesisUtterance(content);
      speechSynthesis.speak(currentUtterance);
    } else if ($settings?.audio?.TTSEngine === 'openai') {
      console.log('openai');

      // Merge one-word fragments into the previous sentence so the TTS
      // endpoint is not called for tiny snippets
      const sentences = extractSentences(content).reduce((mergedTexts, currentText) => {
        const lastIndex = mergedTexts.length - 1;
        if (lastIndex >= 0) {
          const previousText = mergedTexts[lastIndex];
          const wordCount = previousText.split(/\s+/).length;
          if (wordCount < 2) {
            mergedTexts[lastIndex] = previousText + ' ' + currentText;
          } else {
            mergedTexts.push(currentText);
          }
        } else {
          mergedTexts.push(currentText);
        }
        return mergedTexts;
      }, []);

      console.log(sentences);

      let lastPlayedAudioPromise = Promise.resolve(); // Initialize a promise that resolves immediately

      for (const [idx, sentence] of sentences.entries()) {
        const res = await synthesizeOpenAISpeech(
          localStorage.token,
          $settings?.audio?.speaker,
          sentence,
          $settings?.audio?.model
        ).catch((error) => {
          toast.error(error);
          assistantSpeaking = false;
          return null;
        });

        if (res) {
          const blob = await res.blob();
          const blobUrl = URL.createObjectURL(blob);
          const audio = new Audio(blobUrl);
          assistantAudio[idx] = audio;
          lastPlayedAudioPromise = lastPlayedAudioPromise.then(() => playAudio(idx));
        }
      }
    }
  };
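
  // Runs when the recorder stops: on a confirmed turn it optionally snapshots
  // the camera feed, transcribes the audio, and then re-arms the microphone.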
  const stopRecordingCallback = async () => {
    if ($showCallOverlay) {
      if (confirmed) {
        loading = true;

        if (cameraStream) {
          const imageUrl = takeScreenshot();

          files = [
            ...files,
            {
              type: 'image',
              url: imageUrl
            }
          ];
        }

        const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
        await transcribeHandler(audioBlob);

        confirmed = false;
        loading = false;
      }

      audioChunks = [];
      mediaRecorder = null;

      startRecording();
    } else {
      audioChunks = [];
      mediaRecorder = null;
    }
  };
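
  // Requests the microphone and starts a MediaRecorder; recorded data is only
  // kept if speech was actually detected.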
  const startRecording = async () => {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    mediaRecorder = new MediaRecorder(stream);

    mediaRecorder.onstart = () => {
      console.log('Recording started');
      audioChunks = [];
      analyseAudio(stream);
    };

    mediaRecorder.ondataavailable = (event) => {
      if (hasStartedSpeaking) {
        audioChunks.push(event.data);
      }
    };

    mediaRecorder.onstop = async () => {
      console.log('Recording stopped');
      await stopRecordingCallback();
    };

    mediaRecorder.start();
  };
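
  // Lazily opens the webcam the first time the camera button is pressed and
  // pipes the stream into the <video id="camera-feed"> element.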
  const startCamera = async () => {
    if (cameraStream === null) {
      camera = true;
      await tick();

      try {
        const video = document.getElementById('camera-feed') as HTMLVideoElement;
        if (video) {
          cameraStream = await navigator.mediaDevices.getUserMedia({ video: true });
          video.srcObject = cameraStream;
          await video.play();
        }
      } catch (err) {
        console.error('Error accessing webcam: ', err);
      }
    }
  };
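
  // Captures the current camera frame onto the hidden canvas, flipped
  // horizontally, and returns it as a PNG data URL for attaching to the prompt.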
  const takeScreenshot = () => {
    const video = document.getElementById('camera-feed') as HTMLVideoElement;
    const canvas = document.getElementById('camera-canvas') as HTMLCanvasElement;

    if (!canvas || !video) {
      return;
    }

    const context = canvas.getContext('2d');

    // Make the canvas match the video dimensions
    canvas.width = video.videoWidth;
    canvas.height = video.videoHeight;

    // Draw the flipped image from the video onto the canvas
    context.save();
    context.scale(-1, 1); // Flip horizontally
    context.drawImage(video, 0, 0, video.videoWidth * -1, video.videoHeight);
    context.restore();

    // Convert the canvas to a base64 data URL
    const dataURL = canvas.toDataURL('image/png');
    console.log(dataURL);
    return dataURL;
  };

  const stopCamera = () => {
    if (cameraStream) {
      const tracks = cameraStream.getTracks();
      tracks.forEach((track) => track.stop());
    }

    cameraStream = null;
    camera = false;
  };
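
  // Opening the overlay starts listening immediately; closing it stops the
  // camera, and the recording loop exits on its own once $showCallOverlay is false.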
  $: if ($showCallOverlay) {
    startRecording();
  } else {
    stopCamera();
  }
</script>

{#if $showCallOverlay}
  <audio id="audioElement" src="" style="display: none;" />

  <div class=" absolute w-full h-full flex z-[999]">
    <div
      class="absolute w-full h-full bg-white text-gray-700 dark:bg-black dark:text-gray-300 flex justify-center"
    >
      <div class="max-w-lg w-full h-screen max-h-[100dvh] flex flex-col justify-between p-6">
        {#if camera}
          <div class="flex justify-center items-center pt-2 w-full h-20">
            {#if loading}
              <svg
                class="size-12 text-gray-900 dark:text-gray-400"
                viewBox="0 0 24 24"
                fill="currentColor"
                xmlns="http://www.w3.org/2000/svg"
                ><style>
                  .spinner_qM83 {
                    animation: spinner_8HQG 1.05s infinite;
                  }
                  .spinner_oXPr {
                    animation-delay: 0.1s;
                  }
                  .spinner_ZTLf {
                    animation-delay: 0.2s;
                  }
                  @keyframes spinner_8HQG {
                    0%,
                    57.14% {
                      animation-timing-function: cubic-bezier(0.33, 0.66, 0.66, 1);
                      transform: translate(0);
                    }
                    28.57% {
                      animation-timing-function: cubic-bezier(0.33, 0, 0.66, 0.33);
                      transform: translateY(-6px);
                    }
                    100% {
                      transform: translate(0);
                    }
                  }
                </style><circle class="spinner_qM83" cx="4" cy="12" r="3" /><circle
                  class="spinner_qM83 spinner_oXPr"
                  cx="12"
                  cy="12"
                  r="3"
                /><circle class="spinner_qM83 spinner_ZTLf" cx="20" cy="12" r="3" /></svg
              >
            {:else}
              <div
                class=" {rmsLevel * 100 > 4
                  ? ' size-[4.5rem]'
                  : rmsLevel * 100 > 2
                    ? ' size-16'
                    : rmsLevel * 100 > 1
                      ? 'size-14'
                      : 'size-12'} transition-all bg-black dark:bg-white rounded-full"
              />
            {/if}
            <!-- navbar -->
          </div>
        {/if}
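
        <!-- Main stage: loading spinner while the model responds, an RMS-driven pulse while listening, or the live camera preview -->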
        <div class="flex justify-center items-center w-full flex-1">
          {#if !camera}
            {#if loading}
              <svg
                class="size-44 text-gray-900 dark:text-gray-400"
                viewBox="0 0 24 24"
                fill="currentColor"
                xmlns="http://www.w3.org/2000/svg"
                ><style>
                  .spinner_qM83 {
                    animation: spinner_8HQG 1.05s infinite;
                  }
                  .spinner_oXPr {
                    animation-delay: 0.1s;
                  }
                  .spinner_ZTLf {
                    animation-delay: 0.2s;
                  }
                  @keyframes spinner_8HQG {
                    0%,
                    57.14% {
                      animation-timing-function: cubic-bezier(0.33, 0.66, 0.66, 1);
                      transform: translate(0);
                    }
                    28.57% {
                      animation-timing-function: cubic-bezier(0.33, 0, 0.66, 0.33);
                      transform: translateY(-6px);
                    }
                    100% {
                      transform: translate(0);
                    }
                  }
                </style><circle class="spinner_qM83" cx="4" cy="12" r="3" /><circle
                  class="spinner_qM83 spinner_oXPr"
                  cx="12"
                  cy="12"
                  r="3"
                /><circle class="spinner_qM83 spinner_ZTLf" cx="20" cy="12" r="3" /></svg
              >
            {:else}
              <div
                class=" {rmsLevel * 100 > 4
                  ? ' size-52'
                  : rmsLevel * 100 > 2
                    ? 'size-48'
                    : rmsLevel * 100 > 1
                      ? 'size-[11.5rem]'
                      : 'size-44'} transition-all bg-black dark:bg-white rounded-full"
              />
            {/if}
          {:else}
            <div class="relative video-container w-full h-full py-6 px-2">
              <video
                id="camera-feed"
                autoplay
                class="w-full h-full object-cover object-center rounded-2xl"
              />
              <canvas id="camera-canvas" style="display:none;" />

              <div class=" absolute top-8 left-4">
                <button
                  type="button"
                  class="p-1.5 text-white cursor-pointer backdrop-blur-xl bg-black/10 rounded-full"
                  on:click={() => {
                    stopCamera();
                  }}
                >
                  <svg
                    xmlns="http://www.w3.org/2000/svg"
                    viewBox="0 0 16 16"
                    fill="currentColor"
                    class="size-6"
                  >
                    <path
                      d="M5.28 4.22a.75.75 0 0 0-1.06 1.06L6.94 8l-2.72 2.72a.75.75 0 1 0 1.06 1.06L8 9.06l2.72 2.72a.75.75 0 1 0 1.06-1.06L9.06 8l2.72-2.72a.75.75 0 0 0-1.06-1.06L8 6.94 5.28 4.22Z"
                    />
                  </svg>
                </button>
              </div>
            </div>
          {/if}
        </div>
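
        <!-- Bottom control bar: camera toggle, status label, and end-call button -->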
        <div class="flex justify-between items-center pb-2 w-full">
          <div>
            <Tooltip content="Camera">
              <button
                class=" p-3 rounded-full bg-gray-50 dark:bg-gray-900"
                type="button"
                on:click={() => {
                  startCamera();
                }}
              >
                <svg
                  xmlns="http://www.w3.org/2000/svg"
                  fill="none"
                  viewBox="0 0 24 24"
                  stroke-width="1.5"
                  stroke="currentColor"
                  class="size-5"
                >
                  <path
                    stroke-linecap="round"
                    stroke-linejoin="round"
                    d="M6.827 6.175A2.31 2.31 0 0 1 5.186 7.23c-.38.054-.757.112-1.134.175C2.999 7.58 2.25 8.507 2.25 9.574V18a2.25 2.25 0 0 0 2.25 2.25h15A2.25 2.25 0 0 0 21.75 18V9.574c0-1.067-.75-1.994-1.802-2.169a47.865 47.865 0 0 0-1.134-.175 2.31 2.31 0 0 1-1.64-1.055l-.822-1.316a2.192 2.192 0 0 0-1.736-1.039 48.774 48.774 0 0 0-5.232 0 2.192 2.192 0 0 0-1.736 1.039l-.821 1.316Z"
                  />
                  <path
                    stroke-linecap="round"
                    stroke-linejoin="round"
                    d="M16.5 12.75a4.5 4.5 0 1 1-9 0 4.5 4.5 0 0 1 9 0ZM18.75 10.5h.008v.008h-.008V10.5Z"
                  />
                </svg>
              </button>
            </Tooltip>
          </div>

          <div>
            <button type="button">
              <div class=" line-clamp-1 text-sm font-medium">
                {#if loading}
                  Thinking...
                {:else}
                  Listening...
                {/if}
              </div>
            </button>
          </div>

          <div>
            <button
              class=" p-3 rounded-full bg-gray-50 dark:bg-gray-900"
              on:click={async () => {
                showCallOverlay.set(false);
              }}
              type="button"
            >
              <svg
                xmlns="http://www.w3.org/2000/svg"
                viewBox="0 0 20 20"
                fill="currentColor"
                class="size-5"
              >
                <path
                  d="M6.28 5.22a.75.75 0 0 0-1.06 1.06L8.94 10l-3.72 3.72a.75.75 0 1 0 1.06 1.06L10 11.06l3.72 3.72a.75.75 0 1 0 1.06-1.06L11.06 10l3.72-3.72a.75.75 0 0 0-1.06-1.06L10 8.94 6.28 5.22Z"
                />
              </svg>
            </button>
          </div>
        </div>
      </div>
    </div>
  </div>
{/if}