@inproceedings{beck2021wavebender, title={{W}avebender {GAN}: {C}ontrollable speech synthesis for speech sciences}, author={D{\"o}hler Beck, Gustavo Teodoro and Wennberg, Ulme and Henter, Gustav Eje and Malisz, Zofia}, booktitle={Proc. International Conference on Tone and Intonation (TAI)}, abstract={Artificial modeling of human speech has depended on an ongoing dialogue between speech scientists and engineers: speech science helped synthesis get started. Reciprocally, insights into the speech sciences, such as evidence for categorical speech perception and speech perception theory, were reached with the use of synthetic stimuli. Unfortunately, the fields have grown apart in pursuit of different goals. Speech technology has strived for ever more realistic-sounding synthesis, recently culminating in neural vocoders and sequence-to-sequence systems based on deep learning. As a result, technology is now capable of imitating human speech remarkably well, but with little or no explicit control over the output. Synthesis controllability, i.e., the ability to create and manipulate stimuli with precise control over acoustic cues such as pitch, duration, etc., is central to speech-research goals, particularly those involving the disentanglement of different types of information in speech and their perceptual and neurophysiological correlates. Because speech technology has thus far been unable to offer adequate control functionality, speech science remains reliant on outdated synthesis methods, such as formant-based speech generation (1950s) or acoustic feature editing (e.g., PSOLA, 1990s), that do offer control functionality. However, as these methods generally have low perceptual similarity to natural speech, the field runs the risk of insufficient universality and robustness of the associated research findings. This work aims to push for progress in both speech science and technology by combining synthesis realism and control. We propose Wavebender GAN, a speech-synthesis system capable of bridging this gap. The idea of Wavebender GAN is to use deep learning to predict mel-spectrograms from low-level signal properties alone (e.g., formants, spectral slope, and the f0 contour). A high-quality speech waveform is then synthesized using state-of-the-art neural vocoders such as WaveGlow and HiFi-GAN. This resembles some modern text-to-speech systems with f0 control, but we use low-level signal properties as the only inputs, and no text. At a glance, the process of creating our Wavebender GAN system can be split into four key stages: (1) select uncorrelated low-level signal properties as system inputs that contain sufficient information to predict natural-sounding speech mel-spectrograms and are also of interest to manipulate independently; (2) perform data augmentation (e.g., pitch and gain manipulation) to allow the model to explore the domain of relevant input and output features more effectively; (3) use the augmented data to train Wavebender Net, a version of the ResNet architecture adapted to predict mel-spectrograms from the selected low-level signal properties; and (4) improve the realism of the predicted mel-spectrograms by enhancing them with a conditional GAN (cGAN). Currently, Wavebender GAN has been trained on the publicly available, single-speaker LJ Speech dataset. Initial subjective evaluations of output control and quality suggest very good results on both measures.
In sum, Wavebender GAN enables speech scientists to construct end-to-end pipelines for stimulus creation and the testing of phonological models. The system provides a technological update to these pipelines in that it generates synthetic speech signals that are controllable while remaining correlated with a larger share of natural speech cues.}, address={S{\o}nderborg, Denmark}, month={Dec.}, volume={1}, year={2021} }