From 1860fac87857e73da977385e1339fd4b1d9eced5 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 21 Aug 2025 18:31:13 +0000
Subject: [PATCH 1/2] Initial plan


From 43e3ea2340eaea729edcb8a9039880a2d237122d Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 21 Aug 2025 18:42:25 +0000
Subject: [PATCH 2/2] Implement OpenAI TTS functionality with UI settings and
 selected text support

Co-authored-by: PeterDaveHello <3691490+PeterDaveHello@users.noreply.github.com>
---
 src/_locales/en/main.json               |   9 +-
 src/_locales/zh-hans/main.json          |   9 +-
 src/components/ReadButton/index.jsx     |  82 +++++++++++++++-
 src/config/index.mjs                    |   6 ++
 src/content-script/menu-tools/index.mjs |  56 +++++++++++
 src/popup/sections/GeneralPart.jsx      |  82 ++++++++++++++++
 src/services/openai-tts.mjs             | 119 ++++++++++++++++++++++++
 7 files changed, 356 insertions(+), 7 deletions(-)
 create mode 100644 src/services/openai-tts.mjs

diff --git a/src/_locales/en/main.json b/src/_locales/en/main.json
index 174f99af..da6f211e 100644
--- a/src/_locales/en/main.json
+++ b/src/_locales/en/main.json
@@ -160,5 +160,12 @@
   "Type": "Type",
   "Mode": "Mode",
   "Custom": "Custom",
-  "Crop Text to ensure the input tokens do not exceed the model's limit": "Crop Text to ensure the input tokens do not exceed the model's limit"
+  "Crop Text to ensure the input tokens do not exceed the model's limit": "Crop Text to ensure the input tokens do not exceed the model's limit",
+  "Text-to-Speech Settings": "Text-to-Speech Settings",
+  "Enable OpenAI TTS (requires API key)": "Enable OpenAI TTS (requires API key)",
+  "TTS Voice": "TTS Voice",
+  "TTS Model": "TTS Model",
+  "TTS Speed": "TTS Speed",
+  "Read Aloud": "Read Aloud",
+  "Read Selected Text": "Read Selected Text"
 }
diff --git a/src/_locales/zh-hans/main.json b/src/_locales/zh-hans/main.json
index 80d06c85..3614ff7d 100644
--- a/src/_locales/zh-hans/main.json
+++ b/src/_locales/zh-hans/main.json
@@ -166,5 +166,12 @@
   "ChatGLM (Emohaa)": "ChatGLM (Emohaa, 专业情绪咨询)",
   "ChatGLM (CharGLM-3)": "ChatGLM (CharGLM-3, 角色扮演)",
   "Crop Text to ensure the input tokens do not exceed the model's limit": "裁剪文本以确保输入token不超过模型限制",
-  "Thinking Content": "思考内容"
+  "Thinking Content": "思考内容",
+  "Text-to-Speech Settings": "语音朗读设置",
+  "Enable OpenAI TTS (requires API key)": "启用 OpenAI TTS (需要 API 密钥)",
+  "TTS Voice": "TTS 声音",
+  "TTS Model": "TTS 模型",
+  "TTS Speed": "朗读速度",
+  "Read Aloud": "朗读",
+  "Read Selected Text": "朗读选中文本"
 }
diff --git a/src/components/ReadButton/index.jsx b/src/components/ReadButton/index.jsx
index 5ca051c2..e90cc41b 100644
--- a/src/components/ReadButton/index.jsx
+++ b/src/components/ReadButton/index.jsx
@@ -1,8 +1,9 @@
-import { useState } from 'react'
+import { useState, useEffect } from 'react'
 import { MuteIcon, UnmuteIcon } from '@primer/octicons-react'
 import PropTypes from 'prop-types'
 import { useTranslation } from 'react-i18next'
 import { useConfig } from '../../hooks/use-config.mjs'
+import { speakText, isTtsAvailable } from '../../services/openai-tts.mjs'
 
 ReadButton.propTypes = {
   contentFn: PropTypes.func.isRequired,
@@ -15,9 +16,62 @@ const synth = window.speechSynthesis
 function ReadButton({ className, contentFn, size }) {
   const { t } = useTranslation()
   const [speaking, setSpeaking] = useState(false)
+  const [loading, setLoading] = useState(false)
+  const [useOpenAiTts, setUseOpenAiTts] = useState(false)
+  const [currentAudio, setCurrentAudio] = useState(null)
   const config = useConfig()
 
-  const startSpeak = () => {
+  // Check if OpenAI TTS is available on component mount and config changes
+  useEffect(() => {
+    const checkTtsAvailability = async () => {
+      const available = await isTtsAvailable()
+      setUseOpenAiTts(available)
+    }
+    checkTtsAvailability()
+  }, [config.enableOpenAiTts, config.apiKey])
+
+  const startOpenAiTtsSpeak = async () => {
+    try {
+      setLoading(true)
+      setSpeaking(true)
+
+      const text = contentFn()
+      const audio = await speakText(text, {
+        voice: config.openAiTtsVoice,
+        model: config.openAiTtsModel,
+        speed: config.openAiTtsSpeed,
+      })
+
+      setCurrentAudio(audio)
+      setLoading(false)
+
+      // Use addEventListener rather than assigning onended/onerror: playAudioBlob hangs its
+      // object-URL cleanup on those properties, and assigning them here would replace it
+      audio.addEventListener('ended', () => {
+        setSpeaking(false)
+        setCurrentAudio(null)
+      })
+
+      audio.addEventListener('error', () => {
+        setSpeaking(false)
+        setCurrentAudio(null)
+        setLoading(false)
+        console.error('Audio playback error')
+      })
+
+      await audio.play()
+    } catch (error) {
+      console.error('OpenAI TTS error:', error)
+      setLoading(false)
+      setSpeaking(false)
+      setCurrentAudio(null)
+
+      // Fallback to system TTS on error
+      startSystemTtsSpeak()
+    }
+  }
+
+  const startSystemTtsSpeak = () => {
     synth.cancel()
 
     const text = contentFn()
@@ -46,18 +100,36 @@ function ReadButton({ className, contentFn, size }) {
     setSpeaking(true)
   }
 
+  const startSpeak = () => {
+    if (useOpenAiTts) {
+      startOpenAiTtsSpeak()
+    } else {
+      startSystemTtsSpeak()
+    }
+  }
+
   const stopSpeak = () => {
+    if (currentAudio) {
+      currentAudio.pause()
+      currentAudio.currentTime = 0
+      setCurrentAudio(null)
+    }
     synth.cancel()
     setSpeaking(false)
+    setLoading(false)
   }
 
+  // Show loading state or speaking state
+  const isActive = speaking || loading
+
   return (
     <span
       title={t('Read Aloud')}
-      className={`gpt-util-icon ${className ? className : ''}`}
-      onClick={speaking ? stopSpeak : startSpeak}
+      className={`gpt-util-icon ${className ? className : ''} ${loading ? 'loading' : ''}`}
+      onClick={isActive ? stopSpeak : startSpeak}
+      style={{ opacity: loading ? 0.6 : 1 }}
     >
-      {speaking ? <MuteIcon size={size} /> : <UnmuteIcon size={size} />}
+      {isActive ? <MuteIcon size={size} /> : <UnmuteIcon size={size} />}
     </span>
   )
 }
diff --git a/src/config/index.mjs b/src/config/index.mjs
index fb504aee..e0a40ea7 100644
--- a/src/config/index.mjs
+++ b/src/config/index.mjs
@@ -476,6 +476,12 @@ export const defaultConfig = {
   alwaysPinWindow: false,
   focusAfterAnswer: true,
 
+  // TTS settings
+  enableOpenAiTts: false,
+  openAiTtsVoice: 'alloy',
+  openAiTtsModel: 'tts-1',
+  openAiTtsSpeed: 1.0,
+
   apiKey: '', // openai ApiKey
 
   azureApiKey: '',
diff --git a/src/content-script/menu-tools/index.mjs b/src/content-script/menu-tools/index.mjs
index 0f46392e..04cb50f8 100644
--- a/src/content-script/menu-tools/index.mjs
+++ b/src/content-script/menu-tools/index.mjs
@@ -2,6 +2,7 @@ import { getCoreContentText } from '../../utils/get-core-content-text'
 import Browser from 'webextension-polyfill'
 import { getUserConfig } from '../../config/index.mjs'
 import { openUrl } from '../../utils/open-url'
+import { speakText, isTtsAvailable } from '../../services/openai-tts.mjs'
 
 export const config = {
   newChat: {
@@ -16,6 +17,61 @@ export const config = {
       return `You are an expert summarizer. Carefully analyze the following web page content and provide a concise summary focusing on the key points:\n${getCoreContentText()}`
     },
   },
+  readSelectedText: {
+    label: 'Read Selected Text',
+    action: async (fromBackground) => {
+      console.debug('read selected text action from background', fromBackground)
+
+      const selection = window.getSelection()
+      const selectedText = selection ? selection.toString().trim() : ''
+      if (!selectedText) {
+        alert('Please select some text first')
+        return
+      }
+
+      try {
+        const config = await getUserConfig()
+        const useTts = await isTtsAvailable()
+
+        if (useTts) {
+          // speakText only resolves a loaded audio element; playback has to be started here
+          const audio = await speakText(selectedText, {
+            voice: config.openAiTtsVoice,
+            model: config.openAiTtsModel,
+            speed: config.openAiTtsSpeed,
+          })
+          await audio.play()
+        } else {
+          // Fallback to system TTS
+          const synth = window.speechSynthesis
+          synth.cancel()
+
+          const utterance = new SpeechSynthesisUtterance(selectedText)
+          const voices = synth.getVoices()
+
+          let voice
+          if (config.preferredLanguage.includes('en') && navigator.language.includes('en'))
+            voice = voices.find((v) => v.name.toLowerCase().includes('microsoft aria'))
+          else if (config.preferredLanguage.includes('zh') || navigator.language.includes('zh'))
+            voice = voices.find((v) => v.name.toLowerCase().includes('xiaoyi'))
+          else if (config.preferredLanguage.includes('ja') || navigator.language.includes('ja'))
+            voice = voices.find((v) => v.name.toLowerCase().includes('nanami'))
+          if (!voice)
+            voice = voices.find((v) => v.lang.substring(0, 2) === config.preferredLanguage)
+          if (!voice) voice = voices.find((v) => v.lang === navigator.language)
+
+          if (voice) utterance.voice = voice
+          utterance.rate = 1
+          utterance.volume = 1
+
+          synth.speak(utterance)
+        }
+      } catch (error) {
+        console.error('Error reading selected text:', error)
+        alert('Error reading selected text: ' + error.message)
+      }
+    },
+  },
   openConversationPage: {
     label: 'Open Conversation Page',
     action: async (fromBackground) => {
diff --git a/src/popup/sections/GeneralPart.jsx b/src/popup/sections/GeneralPart.jsx
index 9af6e542..06fadd37 100644
--- a/src/popup/sections/GeneralPart.jsx
+++ b/src/popup/sections/GeneralPart.jsx
@@ -631,6 +631,88 @@ export function GeneralPart({ config, updateConfig, setTabIndex }) {
         />
         {t("Crop Text to ensure the input tokens do not exceed the model's limit")}
       </label>
+
+      {/* Text-to-Speech Settings */}
+      <br />
+      <fieldset>
+        <legend>{t('Text-to-Speech Settings')}</legend>
+        <label>
+          <input
+            type="checkbox"
+            checked={config.enableOpenAiTts}
+            onChange={(e) => {
+              const checked = e.target.checked
+              updateConfig({ enableOpenAiTts: checked })
+            }}
+          />
+          {t('Enable OpenAI TTS (requires API key)')}
+        </label>
+        {config.enableOpenAiTts && (
+          <>
+            <label>
+              <legend>{t('TTS Voice')}</legend>
+              <select
+                required
+                onChange={(e) => {
+                  const voice = e.target.value
+                  updateConfig({ openAiTtsVoice: voice })
+                }}
+              >
+                <option value="alloy" selected={config.openAiTtsVoice === 'alloy'}>
+                  Alloy
+                </option>
+                <option value="echo" selected={config.openAiTtsVoice === 'echo'}>
+                  Echo
+                </option>
+                <option value="fable" selected={config.openAiTtsVoice === 'fable'}>
+                  Fable
+                </option>
+                <option value="onyx" selected={config.openAiTtsVoice === 'onyx'}>
+                  Onyx
+                </option>
+                <option value="nova" selected={config.openAiTtsVoice === 'nova'}>
+                  Nova
+                </option>
+                <option value="shimmer" selected={config.openAiTtsVoice === 'shimmer'}>
+                  Shimmer
+                </option>
+              </select>
+            </label>
+            <label>
+              <legend>{t('TTS Model')}</legend>
+              <select
+                required
+                onChange={(e) => {
+                  const model = e.target.value
+                  updateConfig({ openAiTtsModel: model })
+                }}
+              >
+                <option value="tts-1" selected={config.openAiTtsModel === 'tts-1'}>
+                  TTS-1 (Standard)
+                </option>
+                <option value="tts-1-hd" selected={config.openAiTtsModel === 'tts-1-hd'}>
+                  TTS-1-HD (High Quality)
+                </option>
+              </select>
+            </label>
+            <label>
+              <legend>{t('TTS Speed')}</legend>
+              <input
+                type="range"
+                min="0.25"
+                max="4.0"
+                step="0.25"
+                value={config.openAiTtsSpeed}
+                onChange={(e) => {
+                  const speed = parseFloat(e.target.value)
+                  updateConfig({ openAiTtsSpeed: speed })
+                }}
+              />
+              <span>{config.openAiTtsSpeed}x</span>
+            </label>
+          </>
+        )}
+      </fieldset>
       <br />
       <div style={{ display: 'flex', gap: '10px' }}>
         <button
diff --git a/src/services/openai-tts.mjs b/src/services/openai-tts.mjs
new file mode 100644
index 00000000..49bd800f
--- /dev/null
+++ b/src/services/openai-tts.mjs
@@ -0,0 +1,119 @@
+/**
+ * OpenAI Text-to-Speech service
+ */
+
+import { getUserConfig } from '../config/index.mjs'
+
+/**
+ * Available OpenAI TTS voices
+ */
+export const TTS_VOICES = {
+  alloy: 'Alloy',
+  echo: 'Echo',
+  fable: 'Fable',
+  onyx: 'Onyx',
+  nova: 'Nova',
+  shimmer: 'Shimmer',
+}
+
+/**
+ * Available OpenAI TTS models
+ */
+export const TTS_MODELS = {
+  'tts-1': 'TTS-1 (Standard)',
+  'tts-1-hd': 'TTS-1-HD (High Quality)',
+}
+
+/**
+ * Request speech audio for the given text from the OpenAI TTS endpoint (`/v1/audio/speech`)
+ * @param {string} text - Text to convert to speech
+ * @param {Object} options - Optional `voice`, `model`, `speed`; each defaults to the saved user config
+ * @returns {Promise<Blob>} Audio blob; rejects when no API key is set or the response is not OK
+ */
+export async function generateSpeech(text, options = {}) {
+  const config = await getUserConfig()
+
+  if (!config.apiKey) {
+    throw new Error('OpenAI API key is required for TTS functionality')
+  }
+
+  const {
+    voice = config.openAiTtsVoice || 'alloy',
+    model = config.openAiTtsModel || 'tts-1',
+    speed = config.openAiTtsSpeed || 1.0,
+  } = options
+
+  const response = await fetch(`${config.customOpenAiApiUrl}/v1/audio/speech`, {
+    method: 'POST',
+    headers: {
+      'Content-Type': 'application/json',
+      Authorization: `Bearer ${config.apiKey}`,
+    },
+    body: JSON.stringify({
+      model,
+      input: text,
+      voice,
+      speed: Math.max(0.25, Math.min(4.0, speed)), // Clamp speed between 0.25 and 4.0
+    }),
+  })
+
+  if (!response.ok) {
+    const errorText = await response.text()
+    throw new Error(`OpenAI TTS API error: ${response.status} ${errorText}`)
+  }
+
+  return await response.blob()
+}
+
+/**
+ * Load an audio blob into an audio element; the blob's object URL is revoked on end or error
+ * @param {Blob} audioBlob - Audio blob to play
+ * @returns {Promise<HTMLAudioElement>} Resolves on `canplaythrough`; the caller must call `play()`
+ */
+export function playAudioBlob(audioBlob) {
+  return new Promise((resolve, reject) => {
+    const audioUrl = URL.createObjectURL(audioBlob)
+    const audio = new Audio(audioUrl)
+
+    // Clean up the object URL when audio ends or errors
+    const cleanup = () => {
+      URL.revokeObjectURL(audioUrl)
+    }
+
+    // Listeners instead of on* properties: a caller that assigns audio.onended/onerror on the
+    // returned element would otherwise replace the cleanup and leak the object URL
+    audio.addEventListener('ended', cleanup)
+    audio.addEventListener('error', (error) => {
+      cleanup()
+      reject(error)
+    })
+
+    audio.addEventListener('canplaythrough', () => resolve(audio), { once: true })
+
+    audio.load()
+  })
+}
+
+/**
+ * Generate speech using OpenAI TTS and load it into an audio element (playback is NOT started)
+ * @param {string} text - Text to speak
+ * @param {Object} options - TTS options
+ * @returns {Promise<HTMLAudioElement>} Loaded audio element; call `play()` on it to start playback
+ */
+export async function speakText(text, options = {}) {
+  const audioBlob = await generateSpeech(text, options)
+  return await playAudioBlob(audioBlob)
+}
+
+/**
+ * Check if OpenAI TTS is available and properly configured
+ * @returns {Promise<boolean>} true only when TTS is enabled and an API key is set; false if the config cannot be read
+ */
+export async function isTtsAvailable() {
+  try {
+    const config = await getUserConfig()
+    return Boolean(config.enableOpenAiTts && config.apiKey)
+  } catch (error) {
+    return false
+  }
+}