Implement good-enough TTS without machine learning.

2025-07-29 15:56:44 -04:00 · 2025-07-29 15:56:44 -04:00 · cd43d98fb6
commit cd43d98fb6
parent 8262d78b54
7 changed files with 116 additions and 122 deletions
--- a/cmd/joyful/main.go
+++ b/cmd/joyful/main.go
@ -63,21 +63,18 @@ func initPhysicalDevices(config *config.ConfigParser) map[string]*evdev.InputDev
 func main() {
 	// parse command-line
 	var configFlag string
-	var ttsVoiceFlag string
-	var ttsFlag bool
+	flag.BoolVarP(&logger.IsDebugMode, "debug", "d", false, "Output very verbose debug messages.")
 	flag.StringVarP(&configFlag, "config", "c", "~/.config/joyful", "Directory to read configuration from.")
-	addTTSFlags(&ttsFlag, &ttsVoiceFlag)
+	ttsOps := addTTSFlags()
 	flag.Parse()

 	// parse configs
 	configDir := getConfigDir(configFlag)
 	config := readConfig(configDir)

-	tts, err := newTTS(ttsFlag, ttsVoiceFlag)
+	// initialize TTS
+	tts, err := newTTS(ttsOps)
 	logger.LogIfError(err, "Failed to initialize TTS")
-	if tts != nil {
-		defer tts.Cleanup()
-	}

 	// Initialize virtual devices with event buffers
 	vBuffersByName, vBuffersByDevice := initVirtualBuffers(config)
@ -91,6 +88,12 @@ func main() {
 	// initialize the mode variable
 	mode := config.GetModes()[0]

+	// initialize TTS phrases for modes
+	for _, m := range config.GetModes() {
+		tts.AddMessage(m)
+		logger.LogDebugf("Added TTS message '%s'", m)
+	}
+
 	fmt.Println("Joyful Running! Press Ctrl+C to quit. Press Enter to reload rules.")
 	if len(config.GetModes()) > 1 {
 		logger.Logf("Initial mode set to '%s'", mode)
--- a/cmd/joyful/tts.go
+++ b/cmd/joyful/tts.go
@ -1,65 +1,51 @@
-//go:build !notts
-
 package main

 import (
 	"bytes"
-	"io"
-	"os"
+	"fmt"
+	"os/exec"
+	"strconv"
 	"time"

 	"git.annabunches.net/annabunches/joyful/internal/logger"
-	"github.com/amitybell/piper"
-	asset "github.com/amitybell/piper-asset"
-	alan "github.com/amitybell/piper-voice-alan"
-	jenny "github.com/amitybell/piper-voice-jenny"
 	"github.com/ebitengine/oto/v3"
 	flag "github.com/spf13/pflag"
 )

+type TTSOptions struct {
+	Disabled bool
+	Voice    string
+	Volume   int
+	Pitch    int
+	Range    int
+	Speed    int
+}
+
 type TTS struct {
-	piper.TTS
-	dataDir string
+	options *TTSOptions
 	otoCtx  *oto.Context
+	phrases map[string][]byte
 }

 const (
-	playbackCheckIntervalMs = 250
-	playbackSeekOffsetBytes = 1024
+	playbackCheckIntervalMs = 100
 )

-func addTTSFlags(ttsFlag *bool, ttsVoiceFlag *string) {
-	flag.BoolVar(ttsFlag, "notts", false, "Disable text-to-speech on mode change.")
-	flag.StringVar(ttsVoiceFlag, "voice", "alan", "Which voice to use for TTS; must be 'alan' or 'jenny'")
+// addTTSFlags registers the text-to-speech command-line flags and returns the
+// options struct they populate. The fields hold only the flag defaults until
+// flag.Parse has been called.
+// TODO: make most of this configurable via file
+func addTTSFlags() *TTSOptions {
+	ops := &TTSOptions{}

+	flag.BoolVar(&ops.Disabled, "no-tts", false, "Disable text-to-speech.")
+	flag.StringVar(&ops.Voice, "tts-voice", "en", "Which voice to use for TTS; see 'espeak --voices' for a full list of options.")
+	flag.IntVar(&ops.Volume, "tts-volume", 100, "Text to speech volume")
+	flag.IntVar(&ops.Pitch, "tts-pitch", 50, "Text to speech pitch")
+	flag.IntVar(&ops.Range, "tts-range", 50, "Text to speech pitch range")
+	// Each flag must bind to its own field: binding --tts-speed to Range would
+	// leave Speed at zero and overwrite the pitch-range default.
+	flag.IntVar(&ops.Speed, "tts-speed", 175, "Text to speech speaking speed (in words per minute)")
+
+	return ops
 }

-func newTTS(disable bool, voice string) (*TTS, error) {
-	if disable {
-		return nil, nil
-	}
-
-	dataDir, err := os.MkdirTemp("", "joyful-piper.")
-	if err != nil {
-		return nil, err
-	}
-
-	var ass asset.Asset
-	switch voice {
-	case "jenny":
-		ass = jenny.Asset
-	case "alan":
-		ass = alan.Asset
-	default:
-		ass = alan.Asset
-	}
-
-	pTTS, err := piper.NewEmbedded(dataDir, ass)
-
-	if err != nil {
-		return nil, err
-	}
-
+func makeOtoContext() (*oto.Context, error) {
 	op := &oto.NewContextOptions{
 		SampleRate:   22050,
 		ChannelCount: 1,
@ -72,35 +58,78 @@ func newTTS(disable bool, voice string) (*TTS, error) {
 	}
 	<-readyChan // wait for initialization

+	return otoCtx, nil
+}
+
+func newTTS(ops *TTSOptions) (*TTS, error) {
+	if ops.Disabled {
+		return nil, nil
+	}
+
+	context, err := makeOtoContext()
+	if err != nil {
+		return nil, err
+	}
+
 	return &TTS{
-		TTS:     *pTTS,
-		dataDir: dataDir,
-		otoCtx:  otoCtx,
+		options: ops,
+		otoCtx:  context,
+		phrases: make(map[string][]byte),
 	}, nil
 }

-// "Say" generates TTS audio and plays it in a go routine
-func (t *TTS) Say(msg string) {
-	go func() {
-		wav, err := t.Synthesize(msg)
+// AddMessage synthesizes msg with the espeak-ng command-line tool and caches
+// the resulting audio under msg so that Say can play it later. Synthesis
+// failures are logged and leave the phrase unregistered. It is a no-op on a
+// nil receiver, which is what newTTS returns when TTS is disabled.
+func (t *TTS) AddMessage(msg string) {
+	if t == nil {
+		return
+	}
+
+	// TODO: need to get lots of input validation in here
+	// We execute `espeak-ng` directly because extant libraries produce terrible output
+	// compared to the command-line utility. Running it once per phrase up front also
+	// means Say only ever plays audio that has already been generated.
+	cmd := exec.Command(
+		"espeak-ng", "--stdout",
+		"-v", t.options.Voice,
+		"-a", strconv.Itoa(t.options.Volume),
+		"-p", strconv.Itoa(t.options.Pitch),
+		"-P", strconv.Itoa(t.options.Range),
+		"-s", strconv.Itoa(t.options.Speed),
+		msg,
+	)

-		if err != nil {
-			logger.LogError(err, "")
-			return
+	wavData, err := cmd.Output()
+	if err != nil {
+		logger.LogError(err, "Failed to create TTS data")
+		return
+	}
+
+	t.phrases[msg] = wavData
+}
+
+// Say plays the audio previously cached for msg by AddMessage. Playback runs
+// in its own goroutine, so Say does not wait for the phrase to finish. It
+// returns an error if msg was never registered, and is a no-op on a nil
+// receiver, which is what newTTS returns when TTS is disabled.
+func (t *TTS) Say(msg string) error {
+	if t == nil {
+		return nil
+	}
+
+	wav, ok := t.phrases[msg]
+	if !ok {
+		return fmt.Errorf("tried to play non-buffered phrase '%s'", msg)
+	}
+
+	go func(buf []byte) {
+		// A Reader never writes to buf, so the cached phrase can be replayed.
+		player := t.otoCtx.NewPlayer(bytes.NewReader(buf))
+
+		volume := 0.0
+		player.SetVolume(volume)
+		player.Play()
+
+		// Gradually ramp up the volume to avoid harsh clicks
+		for player.Volume() < 1.0 {
+			volume += 0.01
+			if volume > 1.0 {
+				volume = 1.0
+			}
+
+			player.SetVolume(volume)
+			time.Sleep(1 * time.Millisecond)
 		}

-		wavReader := bytes.NewReader(wav)
-		player := t.otoCtx.NewPlayer(wavReader)
-		// We seek some bytes into the generated audio because there's a click
-		// and a long delay at the beginning of the data.
-		player.Seek(playbackSeekOffsetBytes, io.SeekStart)
-		player.Play()
 		for player.IsPlaying() {
 			time.Sleep(playbackCheckIntervalMs * time.Millisecond)
 		}
-	}()
-}
+	}(wav)

-func (t *TTS) Cleanup() {
-	os.RemoveAll(t.dataDir)
+	return nil
 }
--- a/cmd/joyful/tts_stub.go
+++ b/cmd/joyful/tts_stub.go
@ -1,16 +0,0 @@
-//go:build notts
-
-package main
-
-type Speaker interface {
-	Say(string)
-	Cleanup()
-}
-
-func newTTS(_ bool, _ string) (Speaker, error) {
-	return nil, nil
-}
-
-func addTTSFlags(ttsFlag *bool, ttsVoiceFlag *string) {
-	return
-}