iOS Development - Speech-to-Text with Apple's Native SFSpeechRecognizer API in Swift

1. Add microphone and speech recognition usage descriptions to the .plist file

Usage description entries (NSMicrophoneUsageDescription, NSSpeechRecognitionUsageDescription)
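For reference, a minimal sketch of the two Info.plist entries involved (the key names are the real ones; the description strings below are placeholders, so use wording that fits your app):

<key>NSMicrophoneUsageDescription</key>
<string>The microphone is used to capture your speech.</string>
<key>NSSpeechRecognitionUsageDescription</key>
<string>Speech recognition is used to convert your speech into text.</string>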

2. Code Implementation

//
//  LKSpeechRecognizer.swift
//  Comic
//
//  Created by 李棒棒 on 2024/1/5.
//

import UIKit
import Foundation
import Speech
import AVFoundation

/// Whether the app is running on the simulator
var IS_Simulator: Bool {
#if targetEnvironment(simulator)
return true
#else
return false
#endif
}

enum LKSpeechRecognizerStatus: Int {
    /// Not started
    case none
    /// Not authorized
    case noAuthorize
    /// Recognizing
    case recognizing
    /// Recognition finished
    case recognizeFinished
    /// Recognition closed (stopped externally)
    case recognizeClose
    /// Recognition timed out (no speech within the preset mute interval, default 3 s; ends proactively)
    case recognizeMuteTimeout
    /// Recognition error
    case recognizeError
}

class LKSpeechRecognizer: NSObject {

    static let share = LKSpeechRecognizer()
    
    /// status: recognition status; baseText: raw recognition result; speechText: result after post-processing
    typealias LKSpeechRecognizerResult = (_ status: LKSpeechRecognizerStatus,
                                          _ baseText: String?,
                                          _ speechText: String?,
                                          _ error: Error?) -> Void
    
    private var recognizerResult: LKSpeechRecognizerResult? = nil
    
    private var bestText:String? = ""
    private var speakText:String? = ""
    
    /// Silence interval before recognition is stopped automatically (default: 3 s)
    var muteTime:TimeInterval = 3.0
    
    var recognizerStatus:LKSpeechRecognizerStatus = .none
    
    private var timer:Timer? = nil
    // Set to true when a recognition result arrives; reset to false whenever an audio buffer is appended
    private var isHaveInput:Bool = false
    
    private var speechTask:SFSpeechRecognitionTask?
    // The audio-buffer recognition request
    private var speechRequest:SFSpeechAudioBufferRecognitionRequest?
    
    private var speechRecognizer: SFSpeechRecognizer = {
        // Recognition locale: hard-coded to Simplified Chinese here; use Locale.current instead to follow the system language
        let locale = Locale(identifier: "zh_CN")
        let sRecognizer: SFSpeechRecognizer = SFSpeechRecognizer(locale: locale)!
        return sRecognizer
    }()
    
    private var audioEngine:AVAudioEngine = {
        let aEngine: AVAudioEngine = AVAudioEngine()
        return aEngine
    }()
    
    override init() {
        super.init()
        
        self.speechRecognizer.delegate = self
        // Request authorization
        if IS_Simulator == false {
            //checkAuthorized()
        } else {
            print("The simulator is not supported")
        }
        
    }
}

//MARK: -
extension LKSpeechRecognizer {
    
    // Start recognition
    func startRecordSpeech() {
        
        bestText = nil
        speakText = nil
        
        // Request authorization from the user first
        requestSpeechAuthorization { [weak self] authorizeStatus in
            guard let self = self else { return }
            
            if authorizeStatus == false { // The user did not authorize
                self.recognizerStatus = .noAuthorize
                self.recognizerResult?(.noAuthorize, nil, nil, nil)
                return
            }
            
            self.requestRecordSpeech()
        }
    }
    
    
    func requestRecordSpeech() {
        
        if speechTask != nil {
            speechTask?.cancel()
        }
        
        bestText = nil
        speakText = nil
        
        // AVAudioSession manages the app's audio configuration and its interaction with the audio hardware
        // Configure the audio session for recording
        let audioSession = AVAudioSession.sharedInstance()
        do {
            try audioSession.setCategory(AVAudioSession.Category.record)
            try audioSession.setMode(AVAudioSession.Mode.measurement)
            try audioSession.setActive(true, options: AVAudioSession.SetActiveOptions.notifyOthersOnDeactivation)
        } catch let error {
            
            print("audioSession properties weren't set because of an error:\(error.localizedDescription)")
            recognizerStatus = .recognizeError
            recognizerResult?(.recognizeError,self.bestText,self.speakText,error)
            
            return
        }
        
        
        speechRequest = SFSpeechAudioBufferRecognitionRequest()
        // Contextual phrases that the recognizer should favor
        speechRequest?.contextualStrings = ["data","bank","databank"]
        speechRequest?.taskHint = .search
        // Report partial results so text is delivered while the user is still speaking
        speechRequest?.shouldReportPartialResults = true
        speechTask = speechRecognizer.recognitionTask(with: self.speechRequest!, resultHandler: { [weak self] (result, error) in
            
            guard let self = self else { return }
            
            let isFinished = result?.isFinal ?? false
            
            if result != nil { // There is recognized input
                
                self.isHaveInput = true
                
                let bestString = result!.bestTranscription.formattedString
                print("bestString:\(bestString)")
                
//                var range = NSRange(location: 0, length: bestString.count)
//                if self.speakText?.count ?? 0 > 0 {
//                    range = NSString(string: bestString).range(of: self.speakText ?? "")
//                }
//
//                print("range:\(range)")
//
//                //let nowString = bestString.substring(from: range.length)
//                var nowString = ""
//                nowString = (bestString as NSString).substring(from: range.location)
//
//                print("bestString:\(bestString) - nowString:\(nowString)")
            
                self.bestText = bestString
                self.speakText = bestString
                
                self.recognizerStatus = .recognizing
                self.recognizerResult?(.recognizing,self.bestText,self.speakText,nil)
                
                // After each result, (re)start silence detection; if no new result arrives within muteTime (default 3 s) the recognition is finished
                self.startDetectionSpeech()
            }
            
            if error != nil || isFinished == true {
                
                self.audioEngine.stop()
                self.speechRequest?.endAudio()
                self.speechTask?.cancel()
                
                if self.audioEngine.inputNode.numberOfInputs > 0 {
                    self.audioEngine.inputNode.removeTap(onBus: 0)
                }
                
                if isFinished == true { // Finished
                    self.recognizerStatus = .recognizeFinished
                    self.recognizerResult?(.recognizeFinished,self.bestText,self.speakText,nil)
                    print("Recognition finished")
                }
                
                if let error = error { // An error occurred
                    
                    if self.recognizerStatus != .recognizeMuteTimeout {
                        self.recognizerStatus = .recognizeError
                        self.recognizerResult?(.recognizeError,self.bestText,self.speakText,error)
                    }
                }
            }
        })
        
        // Install a tap on the input node and feed the captured audio buffers into the recognition request
        let inputNode = audioEngine.inputNode
        let format = inputNode.outputFormat(forBus: 0)
        inputNode.installTap(onBus: 0, bufferSize: 400, format: format, block: { [weak self] buffer, when in
            guard let self = self else { return }
            if let speechRequest = self.speechRequest {
                speechRequest.append(buffer)
                self.isHaveInput = false
            }
        })
        
        // Prepare and start the audio engine
        self.audioEngine.prepare()
        do {
            try self.audioEngine.start()
        } catch let error {
            print("audioEngine couldn't start because of an error:\(error.localizedDescription)")
        }
    }
    
    //MARK: - Stop recording and recognition
    func closeRecordSpeech() {
        
        stopDetectionSpeech()
        
        if audioEngine.inputNode.numberOfInputs > 0 {
            audioEngine.inputNode.removeTap(onBus: 0)
        }
        audioEngine.stop()
        audioEngine.reset()
        
        speechRequest?.endAudio()
        
        speechTask?.cancel()
        //speechTask?.finish()
        
        recognizerStatus = .recognizeClose
        recognizerResult?(.recognizeClose,self.bestText,self.speakText,nil)
        
        print("錄音關(guān)閉")
    }
    
    //MARK: - Status and result callback
    func recognizerResult(_ completion: LKSpeechRecognizerResult?) {
        recognizerResult = completion
    }
}

//MARK: - Silence detection
extension LKSpeechRecognizer {

    private func startDetectionSpeech() {
        
        if let timer = timer {
            if timer.isValid {
                timer.invalidate()
            }
        }
        
        NSLog("開始計(jì)時(shí)檢測(cè)")
        timer = Timer.scheduledTimer(timeInterval: muteTime, target: self, selector: #selector(self.didFinishSpeech), userInfo: nil, repeats: false)
        RunLoop.main.add(timer!, forMode: RunLoop.Mode.common)
    }
    
    private func stopDetectionSpeech() {
        if timer != nil {
            timer?.invalidate()
            timer = nil
            NSLog("結(jié)束計(jì)時(shí)檢測(cè)")
        }
    }
    
    @objc private func didFinishSpeech() {
        
        if isHaveInput == false {
            
            print("檢測(cè)到\(muteTime)s內(nèi)沒有說話")
            
            stopDetectionSpeech()
            
            if audioEngine.inputNode.numberOfInputs > 0 {
                audioEngine.inputNode.removeTap(onBus: 0)
            }
            
            audioEngine.stop()
            audioEngine.reset()
            
            speechRequest?.endAudio()
            
            speechTask?.cancel()
            
            recognizerStatus = .recognizeMuteTimeout
            recognizerResult?(.recognizeMuteTimeout,self.bestText,self.speakText,nil)
        }
    }
}

//MARK: - SFSpeechRecognizerDelegate
extension LKSpeechRecognizer: SFSpeechRecognizerDelegate {
    // Called when the availability of the speech recognizer changes
    func speechRecognizer(_ speechRecognizer: SFSpeechRecognizer, availabilityDidChange available: Bool) {
        print("Speech recognizer availability changed: \(speechRecognizer) - \(available)")
    }
    }
}

// Request authorization
extension LKSpeechRecognizer {
    
    //MARK: Check authorization status
    func checkAuthorized() {
        requestSpeechAuthorization { authorizeStatus in
            
        }
    }
    
    //MARK: - Request speech recognition authorization
    func requestSpeechAuthorization(authorize: @escaping (Bool)-> Void ) {
        
        if IS_Simulator == true {
            authorize(false)
            print("The simulator is not supported")
            return
        }
        
        // Request authorization
        DispatchQueue.global().async {
            
            SFSpeechRecognizer.requestAuthorization {[weak self] status in
                //SFSpeechRecognizerAuthorizationStatus
                guard let self = self else {return}
                
                DispatchQueue.main.async {
                    let isSpeechAuthorized:Bool = (status == SFSpeechRecognizerAuthorizationStatus.authorized)
                    authorize(isSpeechAuthorized)
                }
                
                switch status {
                case .notDetermined:
                    NSLog("Speech Recognizer Authorization Status-Not Determined")
                case .denied: // The user denied access
                    NSLog("Speech Recognizer Authorization Status-Denied")
                    DispatchQueue.main.async {
                       
                        let alertController:UIAlertController = UIAlertController(title: "Speech Recognition Not Allowed", message: "Please allow microphone and speech recognition access in \"Settings - Privacy - Speech Recognition\" on your iPhone.", preferredStyle: .alert)
                        
                        alertController.addAction(UIAlertAction(title: "Cancel", style: .cancel, handler: { alertAction in
                            
                        }))
                        alertController.addAction(UIAlertAction(title: "Settings", style: .default, handler: { alertAction in
                            UIApplication.shared.open(URL(string: UIApplication.openSettingsURLString)!)
                           
                        }))
                        AppDelegate.currentViewController?.present(alertController, animated: true)
                    }
                    
                case .restricted:
                    NSLog("Speech Recognizer Authorization Status-Restricted")
                case .authorized:
                    NSLog("Authorized")
                @unknown default:
                    NSLog("unknown")
                }
            }
        }
    }
    
}
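Note that SFSpeechRecognizer.requestAuthorization only covers the speech recognition permission; microphone access is prompted separately the first time the audio engine starts recording. If you want to check it explicitly up front, a minimal sketch along these lines could be added (requestMicrophonePermission is a hypothetical helper, not part of the class above; on iOS 17+ Apple offers AVAudioApplication as a replacement for this AVAudioSession API):

extension LKSpeechRecognizer {
    /// Hypothetical helper: ask for microphone permission before starting the audio engine
    func requestMicrophonePermission(_ completion: @escaping (Bool) -> Void) {
        AVAudioSession.sharedInstance().requestRecordPermission { granted in
            // Deliver the result on the main queue, mirroring requestSpeechAuthorization above
            DispatchQueue.main.async {
                completion(granted)
            }
        }
    }
}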

3. How to Call

Start recognition

LKSpeechRecognizer.share.startRecordSpeech()

Stop manually

LKSpeechRecognizer.share.closeRecordSpeech()

Recognition result callback

LKSpeechRecognizer.share.recognizerResult({ [weak self] (status,baseText,speechText,error) in
            guard let self = self else { return }
            
            print("結(jié)果:\(speechText ?? "")")
            
            if status == .noAuthorize {
                self.statusLab.text = "狀態(tài):用戶麥克風(fēng)未授權(quán)"
            }else if status == .recognizeFinished || status == .recognizeMuteTimeout{
                self.statusLab.text = "狀態(tài):您長(zhǎng)時(shí)間未講話,已停止識(shí)別"
            }else if status == .recognizeClose {
                self.statusLab.text = "狀態(tài):已手動(dòng)停止識(shí)別"
            }else if status == .recognizing {
                self.statusLab.text = "請(qǐng)講話……"
            }else if status == .recognizeError {
                self.statusLab.text =  "狀態(tài)error:\(error?.localizedDescription ?? "")"
            }
    
            self.textView.text = speechText
            
        })
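Apple does not document which queue the recognition result handler is delivered on, so when the callback updates UI it may be safer to hop to the main queue first. A minimal sketch (assuming the same statusLab and textView outlets as above):

LKSpeechRecognizer.share.recognizerResult({ [weak self] (status, baseText, speechText, error) in
    DispatchQueue.main.async {
        guard let self = self else { return }
        // Touch UIKit objects only on the main thread
        self.textView.text = speechText
    }
})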

4. Result

Demo

5. Source Code

LKSpeech
