Add text-to-speech for announcing mode changes.

2025-07-29 13:01:11 -04:00 · 2025-07-29 13:01:11 -04:00 · 8262d78b54
commit 8262d78b54
parent 3e4367f5e7
8 changed files with 209 additions and 32 deletions
--- a/cmd/joyful/main.go
+++ b/cmd/joyful/main.go
@ -2,23 +2,22 @@ package main

 import (
 	"context"
-	"flag"
 	"fmt"
 	"os"
 	"strings"
 	"sync"

+	"github.com/holoplot/go-evdev"
+	flag "github.com/spf13/pflag"
+
 	"git.annabunches.net/annabunches/joyful/internal/config"
 	"git.annabunches.net/annabunches/joyful/internal/logger"
 	"git.annabunches.net/annabunches/joyful/internal/mappingrules"
 	"git.annabunches.net/annabunches/joyful/internal/virtualdevice"
-	"github.com/holoplot/go-evdev"
 )

-func getConfigDir() string {
-	configFlag := flag.String("config", "~/.config/joyful", "Directory to read configuration from.")
-	flag.Parse()
-	configDir := strings.ReplaceAll(*configFlag, "~", "${HOME}")
+// getConfigDir turns the user-supplied configuration directory into a usable
+// path. A leading "~" (alone or followed by "/") is replaced with ${HOME},
+// and then every environment variable reference in the path is expanded.
+func getConfigDir(dir string) string {
+	configDir := dir
+	// Only a leading tilde refers to the home directory. A "~" elsewhere in
+	// the path (e.g. "/data/joyful~old") is an ordinary character and must
+	// not be rewritten.
+	if dir == "~" || strings.HasPrefix(dir, "~/") {
+		configDir = "${HOME}" + dir[1:]
+	}
 	return os.ExpandEnv(configDir)
 }

@ -62,16 +61,31 @@ func initPhysicalDevices(config *config.ConfigParser) map[string]*evdev.InputDev
 }

 func main() {
+	// parse command-line
+	var configFlag string
+	var ttsVoiceFlag string
+	var ttsFlag bool
+	flag.StringVarP(&configFlag, "config", "c", "~/.config/joyful", "Directory to read configuration from.")
+	addTTSFlags(&ttsFlag, &ttsVoiceFlag)
+	flag.Parse()
+
 	// parse configs
-	configDir := getConfigDir()
+	configDir := getConfigDir(configFlag)
 	config := readConfig(configDir)

+	tts, err := newTTS(ttsFlag, ttsVoiceFlag)
+	logger.LogIfError(err, "Failed to initialize TTS")
+	if tts != nil {
+		defer tts.Cleanup()
+	}
+
 	// Initialize virtual devices with event buffers
 	vBuffersByName, vBuffersByDevice := initVirtualBuffers(config)

 	// Initialize physical devices
 	pDevices := initPhysicalDevices(config)

+	// Load the rules
 	rules, eventChannel, cancel, wg := loadRules(config, pDevices, getVirtualDevices(vBuffersByName))

 	// initialize the mode variable
@ -83,6 +97,7 @@ func main() {
 	}

 	for {
+		lastMode := mode
 		// Get an event (blocks if necessary)
 		channelEvent := <-eventChannel

@ -124,6 +139,10 @@ func main() {
 			rules, eventChannel, cancel, wg = loadRules(config, pDevices, getVirtualDevices(vBuffersByName))
 			fmt.Println("Config re-loaded. Only rule changes applied. Device and Mode changes require restart.")
 		}
+
+		if lastMode != mode && tts != nil {
+			tts.Say(mode)
+		}
 	}
 }

--- a/cmd/joyful/tts.go
+++ b/cmd/joyful/tts.go
@ -0,0 +1,106 @@
+//go:build !notts
+
+package main
+
+import (
+	"bytes"
+	"io"
+	"os"
+	"time"
+
+	"git.annabunches.net/annabunches/joyful/internal/logger"
+	"github.com/amitybell/piper"
+	asset "github.com/amitybell/piper-asset"
+	alan "github.com/amitybell/piper-voice-alan"
+	jenny "github.com/amitybell/piper-voice-jenny"
+	"github.com/ebitengine/oto/v3"
+	flag "github.com/spf13/pflag"
+)
+
+// TTS wraps the piper speech synthesizer (embedded, so its methods such as
+// Synthesize are available directly on TTS) together with the state needed
+// to play the generated audio and to clean up afterwards.
+type TTS struct {
+	piper.TTS
+	dataDir string       // temporary directory passed to piper.NewEmbedded; removed by Cleanup
+	otoCtx  *oto.Context // audio context that Say creates its players from
+}
+
+const (
+	// playbackCheckIntervalMs is how long, in milliseconds, Say sleeps
+	// between checks of whether the player is still playing.
+	playbackCheckIntervalMs = 250
+	// playbackSeekOffsetBytes is how many bytes of the synthesized audio Say
+	// skips before it starts playback.
+	playbackSeekOffsetBytes = 1024
+)
+
+// addTTSFlags registers the text-to-speech command-line flags.
+//
+// ttsFlag receives the value of --notts (true means speech is disabled) and
+// ttsVoiceFlag receives the value of --voice. Call it before flag.Parse so
+// the flags are known when the command line is parsed.
+func addTTSFlags(ttsFlag *bool, ttsVoiceFlag *string) {
+	flag.BoolVar(ttsFlag, "notts", false, "Disable text-to-speech on mode change.")
+	flag.StringVar(ttsVoiceFlag, "voice", "alan", "Which voice to use for TTS; must be 'alan' or 'jenny'")
+}
+
+// newTTS builds the text-to-speech engine used to announce mode changes.
+//
+// When disable is true it returns (nil, nil); callers must treat the nil
+// *TTS as "speech is off". voice selects the embedded piper voice: "jenny"
+// picks jenny, while "alan" and any unrecognized value pick alan.
+//
+// On success the returned TTS owns a temporary data directory, so the caller
+// must eventually call Cleanup. On failure that directory is removed here,
+// because the caller never receives a TTS to clean up.
+func newTTS(disable bool, voice string) (*TTS, error) {
+	if disable {
+		return nil, nil
+	}
+
+	// Resolve the voice before touching the filesystem.
+	var voiceAsset asset.Asset
+	switch voice {
+	case "jenny":
+		voiceAsset = jenny.Asset
+	default:
+		// "alan" and anything unrecognized both fall back to alan.
+		voiceAsset = alan.Asset
+	}
+
+	dataDir, err := os.MkdirTemp("", "joyful-piper.")
+	if err != nil {
+		return nil, err
+	}
+
+	// fail discards the temp directory before reporting err. Removal is best
+	// effort: the initialization error is the one worth surfacing.
+	fail := func(err error) (*TTS, error) {
+		_ = os.RemoveAll(dataDir)
+		return nil, err
+	}
+
+	pTTS, err := piper.NewEmbedded(dataDir, voiceAsset)
+	if err != nil {
+		return fail(err)
+	}
+
+	// NOTE(review): 22050 Hz mono signed 16-bit LE is presumably the format
+	// the piper voices produce; confirm if another voice is ever added.
+	otoCtx, readyChan, err := oto.NewContext(&oto.NewContextOptions{
+		SampleRate:   22050,
+		ChannelCount: 1,
+		Format:       oto.FormatSignedInt16LE,
+	})
+	if err != nil {
+		return fail(err)
+	}
+	<-readyChan // wait for initialization
+
+	return &TTS{
+		TTS:     *pTTS,
+		dataDir: dataDir,
+		otoCtx:  otoCtx,
+	}, nil
+}
+
+// Say synthesizes msg and plays it through the audio context. Synthesis and
+// playback run in a new goroutine, so Say returns immediately; failures are
+// logged rather than returned.
+func (t *TTS) Say(msg string) {
+	go func() {
+		wav, err := t.Synthesize(msg)
+		if err != nil {
+			logger.LogError(err, "Failed to synthesize speech")
+			return
+		}
+
+		player := t.otoCtx.NewPlayer(bytes.NewReader(wav))
+		// We seek some bytes into the generated audio because there's a click
+		// and a long delay at the beginning of the data.
+		if _, err := player.Seek(playbackSeekOffsetBytes, io.SeekStart); err != nil {
+			// Skipping the lead-in is only cosmetic, so report the problem
+			// and play the audio from wherever the player currently is.
+			logger.LogError(err, "Failed to skip the start of the TTS audio")
+		}
+		player.Play()
+		// Block this goroutine until playback has finished.
+		for player.IsPlaying() {
+			time.Sleep(playbackCheckIntervalMs * time.Millisecond)
+		}
+	}()
+}
+
+// Cleanup removes the temporary data directory that newTTS created. A failure
+// to remove it is logged rather than returned, since callers run Cleanup from
+// a defer and have nothing useful to do with the error.
+func (t *TTS) Cleanup() {
+	if err := os.RemoveAll(t.dataDir); err != nil {
+		logger.LogError(err, "Failed to remove TTS data directory")
+	}
+}
--- a/cmd/joyful/tts_stub.go
+++ b/cmd/joyful/tts_stub.go
@ -0,0 +1,16 @@
+//go:build notts
+
+package main
+
+// Speaker is the placeholder TTS type for builds made with the notts tag. It
+// declares the two methods main calls on the value returned by newTTS, so
+// main compiles without the real TTS implementation.
+type Speaker interface {
+	// Say announces the given message.
+	Say(string)
+	// Cleanup releases whatever the speaker holds.
+	Cleanup()
+}
+
+// newTTS is the notts-build stand-in for the real constructor. It ignores
+// both arguments and always returns a nil Speaker with a nil error, which
+// callers must treat as "speech is off".
+func newTTS(_ bool, _ string) (Speaker, error) {
+	return nil, nil
+}
+
+// addTTSFlags is the notts-build stand-in for flag registration. It registers
+// no flags and leaves both targets untouched.
+func addTTSFlags(ttsFlag *bool, ttsVoiceFlag *string) {}