Add text-to-speech support. (#13)

Reviewed-on: #13 Co-authored-by: Anna Rose Wiggins <annabunches@gmail.com> Co-committed-by: Anna Rose Wiggins <annabunches@gmail.com>
2025-07-29 19:59:54 +00:00 · 2025-07-29 19:59:54 +00:00 · 9d262977f9
commit 9d262977f9
parent 3e4367f5e7
8 changed files with 203 additions and 32 deletions
--- a/cmd/joyful/main.go
+++ b/cmd/joyful/main.go
@ -2,23 +2,22 @@ package main

 import (
 	"context"
-	"flag"
 	"fmt"
 	"os"
 	"strings"
 	"sync"

+	"github.com/holoplot/go-evdev"
+	flag "github.com/spf13/pflag"
+
 	"git.annabunches.net/annabunches/joyful/internal/config"
 	"git.annabunches.net/annabunches/joyful/internal/logger"
 	"git.annabunches.net/annabunches/joyful/internal/mappingrules"
 	"git.annabunches.net/annabunches/joyful/internal/virtualdevice"
-	"github.com/holoplot/go-evdev"
 )

-func getConfigDir() string {
-	configFlag := flag.String("config", "~/.config/joyful", "Directory to read configuration from.")
-	flag.Parse()
-	configDir := strings.ReplaceAll(*configFlag, "~", "${HOME}")
+// getConfigDir returns dir with every "~" replaced by "${HOME}" and with
+// environment variable references then expanded via os.ExpandEnv.
+func getConfigDir(dir string) string {
+	configDir := strings.ReplaceAll(dir, "~", "${HOME}")
 	return os.ExpandEnv(configDir)
 }

@ -62,27 +61,46 @@ func initPhysicalDevices(config *config.ConfigParser) map[string]*evdev.InputDev
 }

 func main() {
+	// parse command-line
+	var configFlag string
+	flag.BoolVarP(&logger.IsDebugMode, "debug", "d", false, "Output very verbose debug messages.")
+	flag.StringVarP(&configFlag, "config", "c", "~/.config/joyful", "Directory to read configuration from.")
+	ttsOps := addTTSFlags()
+	flag.Parse()
+
 	// parse configs
-	configDir := getConfigDir()
+	configDir := getConfigDir(configFlag)
 	config := readConfig(configDir)

+	// initialize TTS
+	tts, err := newTTS(ttsOps)
+	logger.LogIfError(err, "Failed to initialize TTS")
+
 	// Initialize virtual devices with event buffers
 	vBuffersByName, vBuffersByDevice := initVirtualBuffers(config)

 	// Initialize physical devices
 	pDevices := initPhysicalDevices(config)

+	// Load the rules
 	rules, eventChannel, cancel, wg := loadRules(config, pDevices, getVirtualDevices(vBuffersByName))

 	// initialize the mode variable
 	mode := config.GetModes()[0]

+	// initialize TTS phrases for modes
+	for _, m := range config.GetModes() {
+		tts.AddMessage(m)
+		logger.LogDebugf("Added TTS message '%s'", m)
+	}
+
 	fmt.Println("Joyful Running! Press Ctrl+C to quit. Press Enter to reload rules.")
 	if len(config.GetModes()) > 1 {
 		logger.Logf("Initial mode set to '%s'", mode)
 	}

 	for {
+		lastMode := mode
 		// Get an event (blocks if necessary)
 		channelEvent := <-eventChannel

@ -124,6 +142,10 @@ func main() {
 			rules, eventChannel, cancel, wg = loadRules(config, pDevices, getVirtualDevices(vBuffersByName))
 			fmt.Println("Config re-loaded. Only rule changes applied. Device and Mode changes require restart.")
 		}
+
+		if lastMode != mode && tts != nil {
+			tts.Say(mode)
+		}
 	}
 }

--- a/cmd/joyful/tts.go
+++ b/cmd/joyful/tts.go
@ -0,0 +1,135 @@
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"os/exec"
+	"strconv"
+	"time"
+
+	"git.annabunches.net/annabunches/joyful/internal/logger"
+	"github.com/ebitengine/oto/v3"
+	flag "github.com/spf13/pflag"
+)
+
+// TTSOptions holds the text-to-speech settings. addTTSFlags allocates one and
+// binds command-line flags to it; AddMessage forwards the values to espeak-ng.
+type TTSOptions struct {
+	Disabled bool   // when true, newTTS returns a nil *TTS
+	Voice    string // passed to espeak-ng as -v
+	Volume   int    // passed to espeak-ng as -a
+	Pitch    int    // passed to espeak-ng as -p
+	Range    int    // passed to espeak-ng as -P
+	Speed    int    // passed to espeak-ng as -s
+}
+
+// TTS caches rendered speech phrases and plays them through an oto audio
+// context.
+type TTS struct {
+	options *TTSOptions       // espeak-ng settings used by AddMessage
+	otoCtx  *oto.Context      // audio context that Say creates players from
+	phrases map[string][]byte // espeak-ng output, keyed by the message text
+}
+
+const (
+	// playbackCheckIntervalMs is how long, in milliseconds, the playback
+	// goroutine started by Say sleeps between checks of player.IsPlaying.
+	playbackCheckIntervalMs = 100
+)
+
+// addTTSFlags registers the text-to-speech command-line flags on the default
+// pflag flag set and returns the options struct those flags populate. Call it
+// before flag.Parse; the returned fields hold the flag defaults until parsing
+// has run.
+//
+// TODO: make most of this configurable via file
+func addTTSFlags() *TTSOptions {
+	ops := &TTSOptions{}
+
+	flag.BoolVar(&ops.Disabled, "no-tts", false, "Disable text-to-speech.")
+	flag.StringVar(&ops.Voice, "tts-voice", "en", "Which voice to use for TTS; see 'espeak --voices' for a full list of options.")
+	flag.IntVar(&ops.Volume, "tts-volume", 100, "Text to speech volume")
+	flag.IntVar(&ops.Pitch, "tts-pitch", 50, "Text to speech pitch")
+	flag.IntVar(&ops.Range, "tts-range", 50, "Text to speech pitch range")
+	// The speed flag must target Speed. Binding it to Range left Speed at zero
+	// and let this registration overwrite the pitch-range default with 175.
+	flag.IntVar(&ops.Speed, "tts-speed", 175, "Text to speech speaking speed (in words per minute)")
+
+	return ops
+}
+
+// makeOtoContext opens an oto audio context configured for 22050 Hz,
+// single-channel, signed 16-bit little-endian samples. It does not return
+// until the context has signalled that it is ready.
+func makeOtoContext() (*oto.Context, error) {
+	ctx, ready, err := oto.NewContext(&oto.NewContextOptions{
+		SampleRate:   22050,
+		ChannelCount: 1,
+		Format:       oto.FormatSignedInt16LE,
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	// Block here until oto reports that initialization has finished.
+	<-ready
+
+	return ctx, nil
+}
+
+// newTTS builds a TTS from the given options. When ops.Disabled is set it
+// returns (nil, nil). If the audio context cannot be created it returns a nil
+// TTS together with that error.
+func newTTS(ops *TTSOptions) (*TTS, error) {
+	if ops.Disabled {
+		return nil, nil
+	}
+
+	otoCtx, err := makeOtoContext()
+	if err != nil {
+		return nil, err
+	}
+
+	tts := &TTS{
+		options: ops,
+		otoCtx:  otoCtx,
+		phrases: map[string][]byte{},
+	}
+	return tts, nil
+}
+
+// AddMessage renders msg to audio by running espeak-ng with the configured
+// options and caches the output under msg so that Say can play it later. If
+// espeak-ng fails, the error is logged and nothing is cached. Calling
+// AddMessage on a nil *TTS is a no-op.
+func (t *TTS) AddMessage(msg string) {
+	// newTTS returns a nil *TTS when TTS is disabled or the audio context could
+	// not be created; tolerate that here so callers need no nil check.
+	if t == nil {
+		return
+	}
+
+	// TODO: need to get lots of input validation in here
+	// We execute `espeak-ng` directly because extant libraries produce terrible output
+	// compared to the command-line utility. Rendering up front also lets Say replay
+	// the cached audio without running espeak-ng again.
+	cmd := exec.Command(
+		"espeak-ng", "--stdout",
+		"-v", t.options.Voice,
+		"-a", strconv.Itoa(t.options.Volume),
+		"-p", strconv.Itoa(t.options.Pitch),
+		"-P", strconv.Itoa(t.options.Range),
+		"-s", strconv.Itoa(t.options.Speed),
+		msg,
+	)
+
+	wavData, err := cmd.Output()
+	if err != nil {
+		logger.LogError(err, "Failed to create TTS data")
+		return
+	}
+
+	t.phrases[msg] = wavData
+}
+
+// Say plays the audio previously cached for msg by AddMessage. Playback runs
+// in its own goroutine, so Say returns immediately. It returns an error if no
+// audio has been cached for msg. Calling Say on a nil *TTS does nothing and
+// returns nil.
+func (t *TTS) Say(msg string) error {
+	if t == nil {
+		return nil
+	}
+
+	wavData, ok := t.phrases[msg]
+	if !ok {
+		return fmt.Errorf("tried to play non-buffered phrase '%s'", msg)
+	}
+
+	go func(buf []byte) {
+		// A Reader never writes to buf, so the cached phrase can be shared by
+		// overlapping playbacks.
+		player := t.otoCtx.NewPlayer(bytes.NewReader(buf))
+
+		volume := 0.0
+		player.SetVolume(volume)
+		player.Play()
+
+		// Gradually ramp up the volume to avoid harsh clicks
+		for player.Volume() < 1.0 {
+			volume += 0.01
+			if volume > 1.0 {
+				volume = 1.0
+			}
+
+			player.SetVolume(volume)
+			time.Sleep(1 * time.Millisecond)
+		}
+
+		// Keep the goroutine (and with it the player) alive until playback ends.
+		for player.IsPlaying() {
+			time.Sleep(playbackCheckIntervalMs * time.Millisecond)
+		}
+	}(wavData)
+
+	return nil
+}