astra better ui with camera vlm

Alex Cheema 8 months ago
parent
commit
1a419f1f00

+ 2 - 0
examples/astra/astra.xcodeproj/project.pbxproj

@@ -432,6 +432,7 @@
 				ENABLE_HARDENED_RUNTIME = YES;
 				ENABLE_PREVIEWS = YES;
 				GENERATE_INFOPLIST_FILE = YES;
+				INFOPLIST_KEY_NSCameraUsageDescription = "Capture from camera to send to vision model";
 				INFOPLIST_KEY_NSMicrophoneUsageDescription = "Uses your microphone for transcribing audio";
 				"INFOPLIST_KEY_UIApplicationSceneManifest_Generation[sdk=iphoneos*]" = YES;
 				"INFOPLIST_KEY_UIApplicationSceneManifest_Generation[sdk=iphonesimulator*]" = YES;
@@ -471,6 +472,7 @@
 				ENABLE_HARDENED_RUNTIME = YES;
 				ENABLE_PREVIEWS = YES;
 				GENERATE_INFOPLIST_FILE = YES;
+				INFOPLIST_KEY_NSCameraUsageDescription = "Capture from camera to send to vision model";
 				INFOPLIST_KEY_NSMicrophoneUsageDescription = "Uses your microphone for transcribing audio";
 				"INFOPLIST_KEY_UIApplicationSceneManifest_Generation[sdk=iphoneos*]" = YES;
 				"INFOPLIST_KEY_UIApplicationSceneManifest_Generation[sdk=iphonesimulator*]" = YES;

+ 381 - 21
examples/astra/astra/ContentView.swift

@@ -3,6 +3,131 @@ import WhisperKit
 import AVFoundation
 import Foundation
 import Combine
+import Vision
+import AVFAudio
+
+actor CameraActor {
+    let captureSession = AVCaptureSession()
+    private let photoOutput = AVCapturePhotoOutput()
+    private var isConfigured = false
+    private var currentPhotoCaptureDelegate: PhotoCaptureDelegate?
+
+    func configure() throws {
+        guard !isConfigured else {
+            print("Camera already configured")
+            return
+        }
+
+        print("Starting camera configuration")
+
+        guard let camera = AVCaptureDevice.default(for: .video) else {
+            print("No camera device available")
+            throw CameraError.cameraUnavailable
+        }
+
+        do {
+            let input = try AVCaptureDeviceInput(device: camera)
+            print("Camera input created successfully")
+
+            guard captureSession.canAddInput(input) else {
+                print("Cannot add camera input to session")
+                throw CameraError.cannotAddInputOutput
+            }
+
+            guard captureSession.canAddOutput(photoOutput) else {
+                print("Cannot add photo output to session")
+                throw CameraError.cannotAddInputOutput
+            }
+
+            captureSession.beginConfiguration()
+            captureSession.addInput(input)
+            captureSession.addOutput(photoOutput)
+            captureSession.commitConfiguration()
+
+            print("Camera session configured successfully")
+
+            Task.detached { [weak self] in
+                self?.captureSession.startRunning()
+                print("Camera session started running")
+            }
+
+            isConfigured = true
+            print("Camera fully configured and ready")
+        } catch {
+            print("Error during camera configuration: \(error)")
+            throw error
+        }
+    }
+
+    func capturePhoto() async throws -> String {
+        guard isConfigured else {
+            throw CameraError.notConfigured
+        }
+
+        return try await withCheckedThrowingContinuation { continuation in
+            let photoSettings = AVCapturePhotoSettings()
+
+            let delegate = PhotoCaptureDelegate { result in
+                self.currentPhotoCaptureDelegate = nil
+                continuation.resume(with: result)
+            }
+
+            self.currentPhotoCaptureDelegate = delegate
+
+            Task { @MainActor in
+                self.photoOutput.capturePhoto(with: photoSettings, delegate: delegate)
+            }
+        }
+    }
+}
+
+class PhotoCaptureDelegate: NSObject, AVCapturePhotoCaptureDelegate {
+    private let completionHandler: (Result<String, Error>) -> Void
+
+    init(completionHandler: @escaping (Result<String, Error>) -> Void) {
+        self.completionHandler = completionHandler
+    }
+
+    func photoOutput(_ output: AVCapturePhotoOutput, didFinishProcessingPhoto photo: AVCapturePhoto, error: Error?) {
+        if let error = error {
+            completionHandler(.failure(error))
+            return
+        }
+
+        guard let imageData = photo.fileDataRepresentation() else {
+            completionHandler(.failure(CameraError.imageProcessingFailed))
+            return
+        }
+
+        let base64String = imageData.base64EncodedString()
+        completionHandler(.success(base64String))
+    }
+}
+
+enum CameraError: Error {
+    case cameraUnavailable
+    case cannotAddInputOutput
+    case notConfigured
+    case imageProcessingFailed
+}
+
+struct CameraPreview: UIViewRepresentable {
+    let cameraActor: CameraActor
+
+    func makeUIView(context: Context) -> UIView {
+        let view = UIView(frame: .zero)
+        let previewLayer = AVCaptureVideoPreviewLayer(session: cameraActor.captureSession)
+        previewLayer.videoGravity = .resizeAspectFill
+        view.layer.addSublayer(previewLayer)
+        return view
+    }
+
+    func updateUIView(_ uiView: UIView, context: Context) {
+        if let previewLayer = uiView.layer.sublayers?.first as? AVCaptureVideoPreviewLayer {
+            previewLayer.frame = uiView.bounds
+        }
+    }
+}
 
 struct ContentView: View {
     @State private var whisperKit: WhisperKit?
@@ -19,7 +144,7 @@ struct ContentView: View {
     @State private var currentMemo = ""
     @State private var lastVoiceActivityTime = Date()
     @State private var silenceTimer: Timer?
-    @State private var voiceActivityThreshold: Float = 0.33
+    @State private var voiceActivityThreshold: Float = 0.40
     @State private var silenceTimeThreshold = 1.0
     @State private var debugText = ""
     @State private var apiEndpoint = "http://192.168.212.74:8000/v1/chat/completions"
@@ -29,6 +154,24 @@ struct ContentView: View {
     @State private var streamingResponse = ""
     @State private var cancellables = Set<AnyCancellable>()
 
+    @State private var cameraActor: CameraActor?
+    @State private var showLiveCamera = false
+    @State private var capturedImageBase64: String?
+    @State private var errorMessage: String?
+    @State private var isCameraReady = false
+
+    @State private var speechSynthesizer = AVSpeechSynthesizer()
+    @State private var speechBuffer = ""
+    @State private var wordCount = 0
+    let maxWords = 12
+    @State private var originalSilenceThreshold: Float = 0.40
+    @State private var isTTSActive: Bool = false
+    @State private var canRecordAudio: Bool = true
+    @State private var ttsFinishTime: Date?
+
+    @State private var isRequestInProgress = false
+    @State private var isFirst3WordsOfResponse = true
+
     var body: some View {
         VStack {
             Text(currentText)
@@ -62,6 +205,18 @@ struct ContentView: View {
                 .font(.caption)
                 .foregroundColor(.gray)
 
+            Text("TTS Active: \(isTTSActive ? "Yes" : "No")")
+                .font(.caption)
+                .foregroundColor(isTTSActive ? .green : .red)
+
+            Text("Current Silence Threshold: \(voiceActivityThreshold, specifier: "%.2f")")
+                .font(.caption)
+                .foregroundColor(.blue)
+
+            Text("Original Silence Threshold: \(originalSilenceThreshold, specifier: "%.2f")")
+                .font(.caption)
+                .foregroundColor(.orange)
+
             Slider(value: $voiceActivityThreshold, in: 0.01...1.0) {
                 Text("Voice Activity Threshold: \(voiceActivityThreshold, specifier: "%.2f")")
             }
@@ -76,9 +231,52 @@ struct ContentView: View {
             }
             .frame(height: 200)
             .border(Color.gray, width: 1)
+
+            Toggle("Show Live Camera", isOn: $showLiveCamera)
+                .padding()
+                .onChange(of: showLiveCamera) { newValue in
+                    if newValue {
+                        Task {
+                            await setupCamera()
+                        }
+                    } else {
+                        cameraActor = nil
+                        isCameraReady = false
+                        print("Camera disabled")
+                    }
+                }
+
+            if showLiveCamera {
+                if isCameraReady, let actor = cameraActor {
+                    CameraPreview(cameraActor: actor)
+                        .frame(height: 200)
+                        .cornerRadius(10)
+                        .padding()
+
+                    Button("Capture Photo") {
+                        Task {
+                            await capturePhoto()
+                        }
+                    }
+                    .padding()
+                } else {
+                    ProgressView("Initializing camera...")
+                        .padding()
+                }
+            }
+
+            Text("Camera Ready: \(isCameraReady ? "Yes" : "No")")
+                .padding()
+
+            if let errorMessage = errorMessage {
+                Text("Error: \(errorMessage)")
+                    .foregroundColor(.red)
+                    .padding()
+            }
         }
         .onAppear {
             setupWhisperKit()
+            startTTSMonitoring()
         }
     }
 
@@ -88,14 +286,48 @@ struct ContentView: View {
                 whisperKit = try await WhisperKit(verbose: true)
                 print("WhisperKit initialized successfully")
                 startListening()
-                startAudioBuffering() // Add this line
+                startAudioBuffering()
             } catch {
                 print("Error initializing WhisperKit: \(error)")
             }
         }
     }
 
-    // Add this new function
+    private func startTTSMonitoring() {
+        Timer.scheduledTimer(withTimeInterval: 0.1, repeats: true) { _ in
+            let newTTSActive = speechSynthesizer.isSpeaking
+            if newTTSActive != isTTSActive {
+                isTTSActive = newTTSActive
+                canRecordAudio = !newTTSActive
+                if isTTSActive {
+                    voiceActivityThreshold = 1.0 // Set to max to prevent recording
+                    whisperKit?.audioProcessor.purgeAudioSamples(keepingLast: 0) // Flush audio buffer
+                    print("TTS Started - Audio recording paused")
+                } else {
+                    ttsFinishTime = Date()
+                    print("TTS Finished - Waiting 0.5 seconds before resuming audio recording")
+                }
+                updateDebugText()
+            }
+
+            if !isTTSActive, let finishTime = ttsFinishTime, Date().timeIntervalSince(finishTime) >= 0.5 {
+                whisperKit?.audioProcessor.purgeAudioSamples(keepingLast: 0) // Flush audio buffer
+                voiceActivityThreshold = originalSilenceThreshold
+                canRecordAudio = true
+                ttsFinishTime = nil
+                print("Audio recording resumed after TTS delay")
+                updateDebugText()
+            }
+        }
+    }
+
+    private func updateDebugText() {
+        debugText += "\nTTS Active: \(isTTSActive)"
+        debugText += "\nCurrent Silence Threshold: \(voiceActivityThreshold)"
+        debugText += "\nOriginal Silence Threshold: \(originalSilenceThreshold)"
+        debugText += "\n---"
+    }
+
     private func startAudioBuffering() {
         Task {
             while true {
@@ -138,7 +370,7 @@ struct ContentView: View {
     }
 
     private func checkVoiceActivity() {
-        guard let audioProcessor = whisperKit?.audioProcessor else { return }
+        guard canRecordAudio, let audioProcessor = whisperKit?.audioProcessor else { return }
 
         let voiceDetected = AudioProcessor.isVoiceDetected(
             in: audioProcessor.relativeEnergy,
@@ -146,7 +378,6 @@ struct ContentView: View {
             silenceThreshold: Float(voiceActivityThreshold)
         )
 
-        // Debug logging
         let energyValuesToConsider = Int(Float(bufferSeconds) / 0.1)
         let nextBufferEnergies = audioProcessor.relativeEnergy.suffix(energyValuesToConsider)
         let numberOfValuesToCheck = max(10, nextBufferEnergies.count - 10)
@@ -190,7 +421,6 @@ struct ContentView: View {
                 saveMemoToFile(currentMemo)
                 currentMemo = ""
             }
-            // Flush the transcribed text and reset audio samples
             currentText = ""
             whisperKit?.audioProcessor.purgeAudioSamples(keepingLast: 0)
             print("Ended memo")
@@ -213,7 +443,7 @@ struct ContentView: View {
     private func transcribeInRealTime() {
         Task {
             while isRecordingMemo {
-                if let samples = whisperKit?.audioProcessor.audioSamples, samples.count > WhisperKit.sampleRate {
+                if canRecordAudio, let samples = whisperKit?.audioProcessor.audioSamples, samples.count > WhisperKit.sampleRate {
                     do {
                         let samplesToTranscribe: [Float]
                         if isInitialTranscription {
@@ -222,7 +452,7 @@ struct ContentView: View {
                         } else {
                             samplesToTranscribe = Array(samples)
                         }
-                        
+
                         let result = try await whisperKit?.transcribe(audioArray: samplesToTranscribe)
                         await MainActor.run {
                             let newText = result?.first?.text ?? ""
@@ -259,12 +489,85 @@ struct ContentView: View {
             print("Error saving memo: \(error)")
         }
 
-        // After saving to file, send HTTP request
-        sendMemoToAPI(memo)
+        Task {
+            if !isCameraReady {
+                print("Camera not ready, initializing...")
+                await setupCamera()
+            }
+
+            if let imageBase64 = await capturePhotoBase64() {
+                sendMemoToAPI(memo, imageBase64: imageBase64)
+            } else {
+                sendMemoToAPI(memo)
+            }
+        }
+    }
+
+    private func setupCamera() async {
+        print("Setting up camera...")
+        do {
+            let actor = CameraActor()
+            print("CameraActor instance created")
+            try await actor.configure()
+            print("Camera configured successfully")
+            await MainActor.run {
+                self.cameraActor = actor
+                self.errorMessage = nil
+                self.isCameraReady = true
+                print("Camera setup complete, UI updated")
+            }
+        } catch {
+            print("Camera setup failed: \(error)")
+            await MainActor.run {
+                self.errorMessage = "Failed to initialize camera: \(error.localizedDescription)"
+                self.isCameraReady = false
+                print("Camera setup failure reflected in UI")
+            }
+        }
+    }
+
+    private func capturePhotoBase64() async -> String? {
+        print("Attempting to capture photo...")
+        if !isCameraReady {
+            print("Camera not ready, attempting to initialize...")
+            await setupCamera()
+        }
+
+        guard let actor = cameraActor, isCameraReady else {
+            print("Camera not initialized or not ready, cannot capture photo")
+            await MainActor.run {
+                self.errorMessage = "Camera not initialized or not ready"
+            }
+            return nil
+        }
+
+        do {
+            let base64String = try await actor.capturePhoto()
+            print("Photo captured successfully")
+            await MainActor.run {
+                self.errorMessage = nil
+            }
+            return base64String
+        } catch {
+            print("Error capturing photo: \(error)")
+            await MainActor.run {
+                self.errorMessage = "Failed to capture photo: \(error.localizedDescription)"
+            }
+            return nil
+        }
     }
 
-    private func sendMemoToAPI(_ memo: String) {
+    private func sendMemoToAPI(_ memo: String, imageBase64: String? = nil) {
         Task {
+            guard !isRequestInProgress else {
+                print("A request is already in progress. Skipping this one.")
+                return
+            }
+
+            isRequestInProgress = true
+            isFirst3WordsOfResponse = true  // Reset for new request
+            defer { isRequestInProgress = false }
+
             do {
                 print("Starting API request for memo: \(memo.prefix(50))...")
 
@@ -273,21 +576,25 @@ struct ContentView: View {
                     return
                 }
 
-                let payload: [String: Any] = [
+                var payload: [String: Any] = [
                     "model": "llava-1.5-7b-hf",
                     "messages": [
-                        ["role": "system", "content": ["type": "text", "text": "You are a helpful chat assistant being used with Whisper voice transcription. Please assist the user with their queries."]],
-                        ["role": "user", "content": ["type": "text", "text": memo]]
+                        ["role": "user", "content": [
+                            ["type": "text", "text": "You are a helpful conversational assistant chatting with a Gen Z user using their iPhone for voice transcription and sending images to you with their iPhone camera. Be conversational and concise, with a laid back attitude and be cheerful with humour. User said: " + memo],
+                        ]]
                     ],
                     "temperature": 0.7,
                     "stream": true
                 ]
-                // let payload: [String: Any] = [
-                //     "model": "llama-3.1-8b",
-                //     "messages": [["role": "system", "content": "You are a helpful chat assistant being used with Whisper voice transcription. Please assist the user with their queries."], ["role": "user", "content": memo]],
-                //     "temperature": 0.7,
-                //     "stream": true
-                // ]
+
+                if let imageBase64 = imageBase64 {
+                    if var userMessage = (payload["messages"] as? [[String: Any]])?.last,
+                       var content = userMessage["content"] as? [[String: Any]] {
+                        content.append(["type": "image_url", "image_url": ["url": "data:image/jpeg;base64,\(imageBase64)"]])
+                        userMessage["content"] = content
+                        payload["messages"] = [userMessage]
+                    }
+                }
 
                 guard let jsonData = try? JSONSerialization.data(withJSONObject: payload) else {
                     print("Failed to serialize JSON payload")
@@ -301,7 +608,6 @@ struct ContentView: View {
 
                 print("Sending request to \(url.absoluteString)")
 
-                // Reset the streaming response
                 await MainActor.run {
                     self.streamingResponse = ""
                 }
@@ -348,10 +654,38 @@ struct ContentView: View {
             print("Extracted content: \(content)")
             await MainActor.run {
                 self.streamingResponse += content
+                bufferContent(content)
             }
         }
     }
 
+    private func bufferContent(_ content: String) {
+        speechBuffer += content
+        let words = speechBuffer.split(separator: " ")
+        wordCount = words.count
+
+        if isFirst3WordsOfResponse && wordCount >= 3 {
+            isFirst3WordsOfResponse = false
+            speakBufferedContent()
+        } else if content.contains(".") || content.contains("!") || content.contains("?") || wordCount >= maxWords {
+            speakBufferedContent()
+        }
+    }
+
+    private func speakBufferedContent() {
+        guard !speechBuffer.isEmpty else { return }
+        speakContent(speechBuffer)
+        speechBuffer = ""
+        wordCount = 0
+    }
+
+    private func speakContent(_ content: String) {
+        let utterance = AVSpeechUtterance(string: content)
+        utterance.voice = AVSpeechSynthesisVoice(language: "en-US")
+        utterance.rate = 0.5
+        speechSynthesizer.speak(utterance)
+    }
+
     private func loadModel(_ model: String) async throws -> Bool {
         guard let whisperKit = whisperKit else {
             print("WhisperKit instance not initialized")
@@ -372,4 +706,30 @@ struct ContentView: View {
             return false
         }
     }
+
+    private func capturePhoto() async {
+        print("Attempting to capture photo...")
+        print("Camera ready: \(isCameraReady), CameraActor exists: \(cameraActor != nil)")
+        guard let actor = cameraActor, isCameraReady else {
+            print("Camera not initialized or not ready, cannot capture photo")
+            await MainActor.run {
+                self.errorMessage = "Camera not initialized or not ready"
+            }
+            return
+        }
+
+        do {
+            let base64String = try await actor.capturePhoto()
+            print("Photo captured successfully")
+            await MainActor.run {
+                self.capturedImageBase64 = base64String
+                self.errorMessage = nil
+            }
+        } catch {
+            print("Error capturing photo: \(error)")
+            await MainActor.run {
+                self.errorMessage = "Failed to capture photo: \(error.localizedDescription)"
+            }
+        }
+    }
 }
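
For reference, the user message that `sendMemoToAPI(_:imageBase64:)` builds above follows the OpenAI-style multi-part content schema, with the captured photo attached as a base64 data URL. A sketch of the final payload shape, with the model name and schema taken from the diff; `<memo>` and `<base64>` are placeholders, not literal values:

```swift
// Shape of the request body after the image_url part is appended.
let payload: [String: Any] = [
    "model": "llava-1.5-7b-hf",
    "messages": [[
        "role": "user",
        "content": [
            ["type": "text", "text": "...assistant preamble... User said: <memo>"],
            ["type": "image_url", "image_url": ["url": "data:image/jpeg;base64,<base64>"]]
        ]
    ]],
    "temperature": 0.7,
    "stream": true
]
```

Note that when `imageBase64` is present, the diff replaces `payload["messages"]` with just the mutated user message (`payload["messages"] = [userMessage]`), so the single-message structure above is the only case sent to the endpoint.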

+ 2 - 0
examples/astra/astra/astra.entitlements

@@ -16,5 +16,7 @@
 	<true/>
 	<key>com.apple.security.network.server</key>
 	<true/>
+	<key>com.apple.security.device.camera</key>
+	<true/>
 </dict>
 </plist>
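
The `com.apple.security.device.camera` key added here is the App Sandbox entitlement that permits camera access in the macOS build (the project sets ENABLE_HARDENED_RUNTIME above); on iOS, the NSCameraUsageDescription string added in project.pbxproj is what gates access at runtime, so the two changes are presumably companions for the two platforms.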