Implement good-enough TTS without machine learning.

This commit is contained in:
Anna Rose Wiggins 2025-07-29 15:56:44 -04:00
parent 8262d78b54
commit cd43d98fb6
7 changed files with 116 additions and 122 deletions

View file

@ -63,21 +63,18 @@ func initPhysicalDevices(config *config.ConfigParser) map[string]*evdev.InputDev
func main() {
// parse command-line
var configFlag string
var ttsVoiceFlag string
var ttsFlag bool
flag.BoolVarP(&logger.IsDebugMode, "debug", "d", false, "Output very verbose debug messages.")
flag.StringVarP(&configFlag, "config", "c", "~/.config/joyful", "Directory to read configuration from.")
addTTSFlags(&ttsFlag, &ttsVoiceFlag)
ttsOps := addTTSFlags()
flag.Parse()
// parse configs
configDir := getConfigDir(configFlag)
config := readConfig(configDir)
tts, err := newTTS(ttsFlag, ttsVoiceFlag)
// initialize TTS
tts, err := newTTS(ttsOps)
logger.LogIfError(err, "Failed to initialize TTS")
if tts != nil {
defer tts.Cleanup()
}
// Initialize virtual devices with event buffers
vBuffersByName, vBuffersByDevice := initVirtualBuffers(config)
@ -91,6 +88,12 @@ func main() {
// initialize the mode variable
mode := config.GetModes()[0]
// initialize TTS phrases for modes
for _, m := range config.GetModes() {
tts.AddMessage(m)
logger.LogDebugf("Added TTS message '%s'", m)
}
fmt.Println("Joyful Running! Press Ctrl+C to quit. Press Enter to reload rules.")
if len(config.GetModes()) > 1 {
logger.Logf("Initial mode set to '%s'", mode)

View file

@ -1,65 +1,51 @@
//go:build !notts
package main
import (
"bytes"
"io"
"os"
"fmt"
"os/exec"
"strconv"
"time"
"git.annabunches.net/annabunches/joyful/internal/logger"
"github.com/amitybell/piper"
asset "github.com/amitybell/piper-asset"
alan "github.com/amitybell/piper-voice-alan"
jenny "github.com/amitybell/piper-voice-jenny"
"github.com/ebitengine/oto/v3"
flag "github.com/spf13/pflag"
)
// TTSOptions holds the text-to-speech settings. addTTSFlags creates one and
// returns it; AddMessage reads most of the fields when invoking espeak-ng.
type TTSOptions struct {
// Disabled turns TTS off: newTTS returns a nil *TTS when it is set.
Disabled bool
// Voice is passed to espeak-ng as the -v argument.
Voice string
// Volume is passed to espeak-ng as the -a argument.
Volume int
// Pitch is passed to espeak-ng as the -p argument.
Pitch int
// Range is passed to espeak-ng as the -P argument.
Range int
// Speed is passed to espeak-ng as the -s argument.
Speed int
}
// TTS stores audio for known phrases and plays it through an oto context.
type TTS struct {
piper.TTS
dataDir string
// options holds the settings AddMessage passes to espeak-ng.
options *TTSOptions
// otoCtx is the audio context Say uses to create players.
otoCtx *oto.Context
// phrases maps a message to the audio bytes AddMessage captured for it.
phrases map[string][]byte
}
const (
// playbackCheckIntervalMs is the sleep, in milliseconds, between
// player.IsPlaying polls while Say waits for playback to finish.
playbackCheckIntervalMs = 250
// playbackSeekOffsetBytes is the byte offset passed to player.Seek before
// playback starts.
playbackSeekOffsetBytes = 1024
playbackCheckIntervalMs = 100
)
func addTTSFlags(ttsFlag *bool, ttsVoiceFlag *string) {
flag.BoolVar(ttsFlag, "notts", false, "Disable text-to-speech on mode change.")
flag.StringVar(ttsVoiceFlag, "voice", "alan", "Which voice to use for TTS; must be 'alan' or 'jenny'")
// TODO: make most of this configurable via file
// addTTSFlags registers the text-to-speech command-line flags and returns the
// TTSOptions they are bound to. The returned options hold the flag defaults
// until the flag set is parsed, so call this before flag.Parse.
func addTTSFlags() *TTSOptions {
	ops := &TTSOptions{}
	flag.BoolVar(&ops.Disabled, "no-tts", false, "Disable text-to-speech.")
	flag.StringVar(&ops.Voice, "tts-voice", "en", "Which voice to use for TTS; see 'espeak --voices' for a full list of options.")
	flag.IntVar(&ops.Volume, "tts-volume", 100, "Text to speech volume")
	flag.IntVar(&ops.Pitch, "tts-pitch", 50, "Text to speech pitch")
	flag.IntVar(&ops.Range, "tts-range", 50, "Text to speech pitch range")
	// --tts-speed must target Speed: binding it to Range would overwrite the
	// --tts-range default with 175 and leave Speed at zero.
	flag.IntVar(&ops.Speed, "tts-speed", 175, "Text to speech speaking speed (in words per minute)")
	return ops
}
func newTTS(disable bool, voice string) (*TTS, error) {
if disable {
return nil, nil
}
dataDir, err := os.MkdirTemp("", "joyful-piper.")
if err != nil {
return nil, err
}
var ass asset.Asset
switch voice {
case "jenny":
ass = jenny.Asset
case "alan":
ass = alan.Asset
default:
ass = alan.Asset
}
pTTS, err := piper.NewEmbedded(dataDir, ass)
if err != nil {
return nil, err
}
func makeOtoContext() (*oto.Context, error) {
op := &oto.NewContextOptions{
SampleRate: 22050,
ChannelCount: 1,
@ -72,35 +58,78 @@ func newTTS(disable bool, voice string) (*TTS, error) {
}
<-readyChan // wait for initialization
return otoCtx, nil
}
// newTTS builds a TTS from ops. When ops.Disabled is set it returns (nil, nil),
// so a nil *TTS with a nil error means TTS is turned off. An error from
// makeOtoContext is returned as-is with a nil *TTS.
func newTTS(ops *TTSOptions) (*TTS, error) {
if ops.Disabled {
return nil, nil
}
context, err := makeOtoContext()
if err != nil {
return nil, err
}
return &TTS{
TTS: *pTTS,
dataDir: dataDir,
otoCtx: otoCtx,
options: ops,
otoCtx: context,
// phrases starts empty; AddMessage fills it.
phrases: make(map[string][]byte),
}, nil
}
// "Say" generates TTS audio and plays it in a go routine
func (t *TTS) Say(msg string) {
go func() {
wav, err := t.Synthesize(msg)
// AddMessage runs espeak-ng on msg with the configured voice, volume, pitch,
// range and speed, and stores the command's standard output in t.phrases
// under msg, which is where Say looks it up.
//
// NOTE(review): t.options is dereferenced, so t must be non-nil; newTTS
// returns a nil *TTS when TTS is disabled — confirm callers guard for that.
func (t *TTS) AddMessage(msg string) {
// TODO: need to get lots of input validation in here
// We execute `espeak-ng` directly because extant libraries produce terrible output
// compared to the command-line utility. The output is captured here and kept
// in t.phrases, so Say only has to replay it.
cmd := exec.Command(
"espeak-ng", "--stdout",
"-v", t.options.Voice,
"-a", strconv.Itoa(t.options.Volume),
"-p", strconv.Itoa(t.options.Pitch),
"-P", strconv.Itoa(t.options.Range),
"-s", strconv.Itoa(t.options.Speed),
msg,
)
if err != nil {
logger.LogError(err, "")
return
wavData, err := cmd.Output()
if err != nil {
logger.LogError(err, "Failed to create TTS data")
return
}
t.phrases[msg] = wavData
}
// "Say" plays the audio previously buffered for msg (see t.phrases) in a go routine; it returns an error if msg has not been buffered
func (t *TTS) Say(msg string) error {
if _, ok := t.phrases[msg]; !ok {
return fmt.Errorf("tried to play non-buffered phrase '%s'", msg)
}
go func(buf []byte) {
buffer := bytes.NewBuffer(buf)
player := t.otoCtx.NewPlayer(buffer)
volume := 0.0
player.SetVolume(volume)
player.Play()
// Gradually ramp up the volume to avoid harsh clicks
for player.Volume() < 1.0 {
volume += 0.01
if volume > 1.0 {
volume = 1.0
}
player.SetVolume(volume)
time.Sleep(1 * time.Millisecond)
}
wavReader := bytes.NewReader(wav)
player := t.otoCtx.NewPlayer(wavReader)
// We seek some bytes into the generated audio because there's a click
// and a long delay at the beginning of the data.
player.Seek(playbackSeekOffsetBytes, io.SeekStart)
player.Play()
for player.IsPlaying() {
time.Sleep(playbackCheckIntervalMs * time.Millisecond)
}
}()
}
}(t.phrases[msg])
func (t *TTS) Cleanup() {
os.RemoveAll(t.dataDir)
return nil
}

View file

@ -1,16 +0,0 @@
//go:build notts
package main
// Speaker is the text-to-speech interface declared for builds with the notts
// tag; the newTTS stub in this file returns it.
type Speaker interface {
// Say takes a single string — presumably the message to speak; confirm
// against the !notts implementation.
Say(string)
// Cleanup takes no arguments and returns nothing.
Cleanup()
}
// newTTS is the notts-build stand-in: both arguments are ignored and no
// speaker is created, so it always yields a nil Speaker and a nil error.
func newTTS(_ bool, _ string) (Speaker, error) {
	var none Speaker // the zero value of an interface type is nil
	return none, nil
}
// addTTSFlags is the notts-build stand-in: it registers no flags and never
// touches the values behind ttsFlag or ttsVoiceFlag.
func addTTSFlags(ttsFlag *bool, ttsVoiceFlag *string) {}