@inproceedings{kucherenko2021genea, title={{GENEA} {W}orkshop 2021: {T}he 2nd Workshop on Generation and Evaluation of Non-verbal Behaviour for Embodied Agents}, author={Kucherenko, Taras and Jonell, Patrik and Yoon, Youngwoo and Wolfert, Pieter and Yumak, Zerrin and Henter, Gustav Eje}, booktitle={Proc. ICMI}, abstract={Embodied agents benefit from using non-verbal behavior when communicating with humans. Despite several decades of non-verbal behavior-generation research, there is currently no well-developed benchmarking culture in the field. For example, most researchers do not compare their outcomes with previous work, and if they do, they often do so in their own way which frequently is incompatible with others. With the GENEA Workshop 2021, we aim to bring the community together to discuss key challenges and solutions, and find the most appropriate ways to move the field forward.}, keywords={behavior synthesis, gesture generation, datasets, evaluation}, address={Montr{\'e}al, QC}, month={Oct.}, publisher={ACM}, volume={23}, pages={872--873}, doi={10.1145/3462244.3480983}, year={2021} } @article{kucherenko2021multimodal, title={Multimodal analysis of the predictability of hand-gesture properties}, author={Kucherenko, Taras and Nagy, Rajmund and Neff, Michael and Kjellstr{\"o}m, Hedvig and Henter, Gustav Eje}, journal={arXiv preprint arXiv:2108.05762}, abstract={Embodied conversational agents benefit from being able to accompany their speech with gestures. Although many data-driven approaches to gesture generation have been proposed in recent years, it is still unclear whether such systems can consistently generate gestures that convey meaning. We investigate which gesture properties (phase, category, and semantics) can be predicted from speech text and/or audio using contemporary deep learning. In extensive experiments, we show that gesture properties related to gesture meaning (semantics and category) are predictable from text features (time-aligned BERT embeddings) alone, but not from prosodic audio features, while rhythm-related gesture properties (phase) on the other hand can be predicted from either audio, text (with word-level timing information), or both. These results are encouraging as they indicate that it is possible to equip an embodied agent with content-wise meaningful co-speech gestures using a machine-learning model.}, keywords={non-verbal behavior, animation, gesture generation, virtual agents, iconic gestures}, month={Aug.}, year={2021} } @article{mehta2021neural, title={Neural {HMM}s are all you need (for high-quality attention-free {TTS})}, author={Mehta, Shivam and Sz{\'e}kely, {\'E}va and Beskow, Jonas and Henter, Gustav Eje}, journal={arXiv preprint arXiv:2108.13320}, abstract={Neural sequence-to-sequence TTS has achieved significantly better output quality than statistical speech synthesis using HMMs. However, neural TTS is generally not probabilistic and the use of non-monotonic attention both increases training time and introduces "babbling" failure modes that are unacceptable in production. This paper demonstrates that the old and new paradigms can be combined to obtain the advantages of both worlds, by replacing the attention in Tacotron 2 with an autoregressive left-right no-skip hidden Markov model defined by a neural network. This leads to an HMM-based neural TTS model with monotonic alignment, trained to maximise the full sequence likelihood without approximations. 
We discuss how to combine innovations from both classical and contemporary TTS for best results. The final system is smaller and simpler than Tacotron 2, and learns to speak with fewer iterations and less data, whilst achieving the same naturalness prior to the post-net. Unlike Tacotron 2, our system also allows easy control over speaking rate.}, keywords={seq2seq, attention, HMMs, duration modelling, acoustic modelling}, month={Sept.}, year={2021} } @inproceedings{beck2021wavebender, title={{W}avebender {GAN}: {C}ontrollable speech synthesis for speech sciences}, author={D{\"o}hler Beck, Gustavo Teodoro and Wennberg, Ulme and Henter, Gustav Eje and Malisz, Zofia}, booktitle={Proc. International Conference on Tone and Intonation (TAI)}, abstract={Artificial modeling of human speech has depended on an ongoing dialogue between speech scientists and engineers: speech science helped synthesis get started. Reciprocally, insights into speech sciences, such as evidence for categorical speech perception and speech perception theory, were reached with the use of synthetic stimuli. Unfortunately, the fields have grown apart in pursuit of different goals. Speech technology has strived for ever more realistic-sounding synthesis, recently culminating in neural vocoders and sequence-to-sequence systems based on deep learning. As a result, technology is now capable of imitating human speech remarkably well, but with little or no explicit control over the output. Synthesis controllability, i.e., the ability to create and manipulate stimuli with precise control over acoustic cues such as pitch, duration, etc., is central to speech-research goals, particularly those involving the disentanglement of different types of information in speech and their perceptual and neurophysiological correlates. As speech technology has as yet been unable to offer adequate control functionality, speech science remains reliant on outdated synthesis methods such as formant-based speech generation (1950s) or acoustic feature editing (e.g., PSOLA, 1990s) that do offer control functionality. However, as these methods generally have low perceptual similarity to natural speech, the field runs the risk of insufficient universality and robustness of the associated research findings. This work aims to push for progress in both speech science and technology by combining synthesis realism and control. We propose Wavebender GAN, a speech-synthesis system capable of bridging this gap. The idea of Wavebender GAN is to use deep learning to predict mel-spectrograms from low-level signal properties alone (e.g., formants, spectral slope, and the f0 contour). A high-quality speech waveform is then synthesized using state-of-the-art neural vocoders such as WaveGlow and HiFi-GAN. This resembles some modern text-to-speech systems with f0 control, but we use low-level signal properties as the only inputs, and no text.
At a glance, the process of creating our Wavebender GAN system can be split into four key stages: (1) select uncorrelated low-level signal properties as system inputs, that contain sufficient information to predict natural-sounding speech mel-spectrograms and are also of interest to manipulate independently; (2) perform data augmentation (e.g., pitch and gain manipulation) to allow the model to explore the domain of relevant input and output features more effectively; (3) use the augmented data to train Wavebender Net, a version of the ResNet architecture adapted to predict mel-spectrograms from the selected low-level signal properties; and (4) improve the realism of the predicted mel-spectrograms by enhancing them using a conditional GAN (cGAN). Currently, our Wavebender GAN has been trained on the publicly available, single-speaker LJ Speech dataset. Initial subjective evaluations of output control and quality suggest very good results on both measures. Taken together, Wavebender GAN enables speech scientists to construct end-to-end pipelines for stimulus creation and testing of phonological models. The system provides a technological update to these pipelines in that it generates synthetic speech signals that are controllable as well as correlated with a larger share of natural speech cues.}, address={S{\o}nderborg, Denmark}, month={Dec.}, volume={1}, year={2021} } @inproceedings{sorkhei2021full, title={{F}ull-{G}low: {F}ully conditional {G}low for more realistic image generation}, author={Sorkhei, Moein and Henter, Gustav Eje and Kjellstr{\"o}m, Hedvig}, booktitle={Proc. GCPR}, abstract={Autonomous agents, such as driverless cars, require large amounts of labeled visual data for their training. A viable approach for acquiring such data is training a generative model with collected real data, and then augmenting the collected real dataset with synthetic images from the model, generated with control of the scene layout and ground truth labeling. In this paper we propose Full-Glow, a fully conditional Glow-based architecture for generating plausible and realistic images of novel street scenes given a semantic segmentation map indicating the scene layout. Benchmark comparisons show our model to outperform recent works in terms of the semantic segmentation performance of a pretrained PSPNet. This indicates that images from our model are, to a higher degree than from other models, similar to real images of the same kinds of scenes and objects, making them suitable as training data for a visual semantic segmentation or object recognition system.}, keywords={conditional image generation, generative models, normalizing flows}, address={Bonn, Germany}, month={Oct.}, publisher={DAGM}, volume={43}, year={2021} } @article{ghosh2021normalizing, title={Normalizing Flow based Hidden {M}arkov Models for Classification of Speech Phones with Explainability}, author={Ghosh, Anubhab and Honor{\'e}, Antoine and Liu, Dong and Henter, Gustav Eje and Chatterjee, Saikat}, journal={arXiv preprint arXiv:2107.00730}, abstract={In pursuit of explainability, we develop generative models for sequential data. The proposed models provide state-of-the-art classification results and robust performance for speech phone classification. We combine modern neural networks (normalizing flows) and traditional generative models (hidden Markov models -- HMMs). Normalizing flow-based mixture models (NMMs) are used to model the conditional probability distribution given the hidden state in the HMMs. 
Model parameters are learned through judicious combinations of time-tested Bayesian learning methods and contemporary neural network learning methods. We mainly combine expectation-maximization (EM) and mini-batch gradient descent. The proposed generative models can compute the likelihood of the data and are hence directly suitable for a maximum-likelihood (ML) classification approach. Due to the structural flexibility of HMMs, we can use different normalizing flow models. This leads to different types of HMMs, providing diversity in data modeling capacity. The diversity provides an opportunity for easy decision fusion from different models. For a standard speech phone classification setup involving 39 phones (classes) and the TIMIT dataset, we show that the use of standard features called mel-frequency cepstral coefficients (MFCCs), the proposed generative models, and the decision fusion together can achieve 86.6\% accuracy by generative training only. This result is close to state-of-the-art results, for example, the 86.2\% accuracy of the PyTorch-Kaldi toolkit, and 85.1\% accuracy using light gated recurrent units. We do not use any discriminative learning approaches or related sophisticated features in this article.}, keywords={phone recognition, generative models, hidden Markov models, neural networks}, month={July}, year={2021} } @article{valle2021transflower, title={{T}ransflower: {P}robabilistic autoregressive dance generation with multimodal attention}, author={Valle-P{\'e}rez, Guillermo and Henter, Gustav Eje and Beskow, Jonas and Holzapfel, Andre and Oudeyer, Pierre-Yves and Alexanderson, Simon}, journal={ACM Trans. Graph.}, abstract={Dance requires skillful composition of complex movements that follow rhythmic, tonal and timbral features of music. Formally, generating dance conditioned on a piece of music can be expressed as a problem of modelling a high-dimensional continuous motion signal, conditioned on an audio signal. In this work we make two contributions to tackle this problem. First, we present a novel probabilistic autoregressive architecture that models the distribution over future poses with a normalizing flow conditioned on previous poses as well as music context, using a multimodal transformer encoder. Second, we introduce the currently largest 3D dance-motion dataset, obtained with a variety of motion-capture technologies, and including both professional and casual dancers.
Using this dataset, we compare our new model against two baselines, via objective metrics and a user study, and show that both the ability to model a probability distribution and the ability to attend over a large motion and music context are necessary to produce interesting, diverse, and realistic dance that matches the music.}, keywords={generative models, machine learning, normalising flows, Glow, transformers, dance}, month={Dec.}, publisher={ACM}, volume={40}, number={6}, pages={1:1--1:13}, doi={10.1145/3478513.3480570}, year={2021} } @article{haakansson2020robot, title={Robot-assisted detection of subclinical dementia: {P}rogress report and preliminary findings}, author={H{\aa}kansson, Krister and Beskow, Jonas and Kjellstr{\"o}m, Hedvig and Gustafsson, Joakim and Bonnard, Alexandre and Ryd{\'e}n, Marie and Stormoen, Sara and Hagman, G{\"o}ran and Akenine, Ulrika and Morales P{\'e}rez, Kristal and Henter, Gustav Eje and Sundell, Maria and Kivipelto, Miia}, journal={Alzheimer's \& Dementia}, abstract={Background: Earlier identification of an underlying AD pathology could increase chances that preventive or curative treatment will be more successful. Human limitations in sensory capacity, attention and parallel processing could mean that automatic and simultaneous registration from several information channels in combination with artificial intelligence processing could advance diagnostic precision and accuracy. Here we report the progress of an interdisciplinary project to develop robot-assisted diagnostics to detect early neurocognitive disorder. Method: Behavior in 100 patients without previous evidence of cognitive disorder is recorded during their first clinical examination visit. We record gaze, pupil dilations, skin temperature changes, speech characteristics, mimics, pulse, heart rate variability, and motor activity (see image 1). We then use machine learning to identify behavior signals and signal patterns that can distinguish between persons with and without an underlying neuropathology. In addition, we perform human analysis of recorded video and audio on each patient. Over 40 behavior categories are carefully checked, among them repetitions, irrelevant deviations, hesitations, degree of problem awareness, restlessness and inadequate word usage. This analysis is performed by a semi-blinded interdisciplinary expert group of clinically experienced physicians, neurologists, speech therapists, psychologists, nurses, and experts in automatic verbal and non-verbal behavior analysis. The results from this study will be implemented to further develop a social robot platform (Furhat, see image 2) with adequate sensors and algorithms so that it can interact with patients and assist in the diagnostic process. Result: Approximately 25 patients have so far been recorded and analyzed. Preliminary results are that the diagnostic prediction by the expert group, solely based on observation of patient behavior from audio and video, has fully coincided with the diagnostic outcome from the subsequent clinical evaluation. Another observation has been the amount of clinically relevant behavior revealed through careful human behavior observation that the interviewing physician did not observe during the actual interview.
Conclusion: Artificial intelligence could represent an untapped potential for improving sensitivity and accuracy in diagnostic procedures to detect early neurocognitive disorders.}, month={Dec.}, publisher={Alzheimer's Association}, volume={16}, number={S6}, pages={e043311}, doi={10.1002/alz.043311}, year={2020} } @inproceedings{kucherenko2020genea, title={The {GENEA} {C}hallenge 2020: {B}enchmarking gesture-generation systems on common data}, author={Kucherenko, Taras and Jonell, Patrik and Yoon, Youngwoo and Wolfert, Pieter and Henter, Gustav Eje}, booktitle={Proc. GENEA Workshop}, abstract={Automatic gesture generation is a field of growing interest, and a key technology for enabling embodied conversational agents. Research into gesture generation is rapidly gravitating towards data-driven methods. Unfortunately, individual research efforts in the field are difficult to compare: there are no established benchmarks, and each study tends to use its own dataset, motion visualisation, and evaluation methodology. To address this situation, we launched the GENEA gesture-generation challenge, wherein participating teams built automatic gesture-generation systems on a common dataset, and the resulting systems were evaluated in parallel in a large, crowdsourced user study. Since differences in evaluation outcomes between systems now are solely attributable to differences between the motion-generation methods, this enables benchmarking recent approaches against one another and investigating the state of the art in the field. This paper provides a first report on the purpose, design, and results of our challenge, with each individual team's entry described in a separate paper also presented at the GENEA Workshop. Additional information about the workshop can be found at genea-workshop.github.io/2020/.}, keywords={gesture generation, conversational agents, evaluation paradigms}, address={Glasgow, UK}, month={Oct.}, publisher={Zenodo}, volume={1}, doi={10.5281/zenodo.4094697}, year={2020} } @inproceedings{ghosh2020robust, title={Robust classification using hidden {M}arkov models and mixtures of normalizing flows}, author={Ghosh, Anubhab and Honor{\'e}, Antoine and Liu, Dong and Henter, Gustav Eje and Chatterjee, Saikat}, booktitle={Proc. MLSP}, abstract={We test the robustness of a maximum-likelihood (ML) based classifier where sequential data as observation is corrupted by noise. The hypothesis is that a generative model, that combines the state transitions of a hidden Markov model (HMM) and the neural network based probability distributions for the hidden states of the HMM, can provide a robust classification performance. The combined model is called normalizing-flow mixture model based HMM (NMM-HMM). It can be trained using a combination of expectation-maximization (EM) and backpropagation. We verify the improved robustness of NMM-HMM classifiers in an application to speech recognition.}, keywords={speech recognition, generative models, hidden Markov models, neural networks}, address={Espoo, Finland}, month={Sept.}, publisher={IEEE}, volume={30}, pages={1--6}, doi={10.1109/MLSP49062.2020.9231775}, year={2020} } @inproceedings{alexanderson2020generating, title={Generating Coherent Spontaneous Speech and Gesture from Text}, author={Alexanderson, Simon and Sz\'{e}kely, \'{E}va and Henter, Gustav Eje and Kucherenko, Taras and Beskow, Jonas}, booktitle={Proc. IVA}, abstract={Embodied human communication encompasses both verbal (speech) and non-verbal information (e.g., gesture and head movements). 
Recent advances in machine learning have substantially improved the technologies for generating synthetic versions of both of these types of data: On the speech side, text-to-speech systems are now able to generate highly convincing, spontaneous-sounding speech using unscripted speech audio as the source material. On the motion side, probabilistic motion-generation methods can now synthesise vivid and lifelike speech-driven 3D gesticulation. In this paper, we put these two state-of-the-art technologies together in a coherent fashion for the first time. Concretely, we demonstrate a proof-of-concept system trained on a single-speaker audio and motion-capture dataset, that is able to generate both speech and full-body gestures together from text input. In contrast to previous approaches for joint speech-and-gesture generation, we generate full-body gestures from speech synthesis trained on recordings of spontaneous speech from the same person as the motion-capture data. We illustrate our results by visualising gesture spaces and text-speech-gesture alignments, and through a demonstration video.}, keywords={gesture synthesis, text-to-speech, neural networks}, address={Glasgow, UK}, month={Oct.}, publisher={ACM}, volume={20}, pages={1:1--1:3}, doi={10.1145/3383652.3423874}, year={2020} } @inproceedings{kucherenko2021large, title={A Large, Crowdsourced Evaluation of Gesture Generation Systems on Common Data: {T}he {GENEA} {C}hallenge 2020}, author={Kucherenko, Taras and Jonell, Patrik and Yoon, Youngwoo and Wolfert, Pieter and Henter, Gustav Eje}, booktitle={Proc. IUI}, abstract={Co-speech gestures, gestures that accompany speech, play an important role in human communication. Automatic co-speech gesture generation is thus a key enabling technology for embodied conversational agents (ECAs), since humans expect ECAs to be capable of multi-modal communication. Research into gesture generation is rapidly gravitating towards data-driven methods. Unfortunately, individual research efforts in the field are difficult to compare: there are no established benchmarks, and each study tends to use its own dataset, motion visualisation, and evaluation methodology. To address this situation, we launched the GENEA Challenge, a gesture-generation challenge wherein participating teams built automatic gesture-generation systems on a common dataset, and the resulting systems were evaluated in parallel in a large, crowdsourced user study using the same motion-rendering pipeline. Since differences in evaluation outcomes between systems now are solely attributable to differences between the motion-generation methods, this enables benchmarking recent approaches against one another in order to get a better impression of the state of the art in the field. This paper reports on the purpose, design, results, and implications of our challenge.}, keywords={evaluation paradigms, conversational agents, gesture generation}, address={College Station, TX}, month={Apr.}, publisher={ACM}, volume={26}, pages={11--21}, doi={10.1145/3397481.3450692}, year={2021} } @inproceedings{wennberg2021case, title={The Case for Translation-Invariant Self-Attention in Transformer-Based Language Models}, author={Wennberg, Ulme and Henter, Gustav Eje}, booktitle={Proc. ACL-IJCNLP}, abstract={Mechanisms for encoding positional information are central for transformer-based language models. 
In this paper, we analyze the position embeddings of existing language models, finding strong evidence of translation invariance, both for the embeddings themselves and for their effect on self-attention. The degree of translation invariance increases during training and correlates positively with model performance. Our findings lead us to propose translation-invariant self-attention (TISA), which accounts for the relative position between tokens in an interpretable fashion without needing conventional position embeddings. Our proposal has several theoretical advantages over existing position-representation approaches. Experiments show that it improves on regular ALBERT on GLUE tasks, while only adding orders of magnitude less positional parameters.}, address={Online}, month={Aug.}, publisher={ACL}, volume={59}, pages={130--140}, doi={10.18653/v1/2021.acl-short.18}, year={2021} } @inproceedings{kucherenko2021speech2properties2gestures, title={Speech\-2\-{P}roperties\-2\-{G}estures: {G}esture-Property Prediction as a Tool for Generating Representational Gestures from Speech}, author={Kucherenko, Taras and Nagy, Rajmund and Jonell, Patrik and Neff, Michael and Kjellstr{\"o}m, Hedvig and Henter, Gustav Eje}, booktitle={Proc. IVA}, abstract={We propose a new framework for gesture generation, aiming to allow data-driven approaches to produce more semantically rich gestures. Our approach first predicts whether to gesture, followed by a prediction of the gesture properties. Those properties are then used as conditioning for a modern probabilistic gesture-generation model capable of high-quality output. This empowers the approach to generate gestures that are both diverse and representational. Follow-ups and more information can be found on the project page: https://svito-zar.github.io/speech2properties2gestures/}, keywords={gesture generation, virtual agents, representational gestures}, address={Kyoto, Japan}, month={Sept.}, publisher={ACM}, volume={21}, pages={145--147}, doi={10.1145/3472306.3478333}, year={2021} } @inproceedings{wang2021integrated, title={Integrated Speech and Gesture Synthesis}, author={Wang, Siyang and Alexanderson, Simon and Gustafson, Joakim and Beskow, Jonas and Henter, Gustav Eje and Sz{\'e}kely, {\'E}va}, booktitle={Proc. ICMI}, abstract={Text-to-speech and co-speech gesture synthesis have until now been treated as separate areas by two different research communities, and applications merely stack the two technologies using a simple system-level pipeline. This can lead to modeling inefficiencies and may introduce inconsistencies that limit the achievable naturalness. We propose to instead synthesize the two modalities in a single model, a new problem we call integrated speech and gesture synthesis (ISG). We also propose a set of models modified from state-of-the-art neural speech-synthesis engines to achieve this goal. We evaluate the models in three carefully-designed user studies, two of which evaluate the synthesized speech and gesture in isolation, plus a combined study that evaluates the models like they will be used in real-world applications -- speech and gesture presented together. The results show that participants rate one of the proposed integrated synthesis models as being as good as the state-of-the-art pipeline system we compare against, in all three tests. 
The model is able to achieve this with faster synthesis time and greatly reduced parameter count compared to the pipeline system, illustrating some of the potential benefits of treating speech and gesture synthesis together as a single, unified problem.}, keywords={neural networks, speech synthesis, gesture generation}, address={Montr{\'e}al, QC}, month={Oct.}, publisher={ACM}, volume={23}, pages={177--185}, doi={10.1145/3462244.3479914}, year={2021} } @inproceedings{jonell2021hemvip, title={{HEMVIP}: {H}uman Evaluation of Multiple Videos in Parallel}, author={Jonell, Patrik and Yoon, Youngwoo and Wolfert, Pieter and Kucherenko, Taras and Henter, Gustav Eje}, booktitle={Proc. ICMI}, abstract={In many research areas, for example motion and gesture generation, objective measures alone do not provide an accurate impression of key stimulus traits such as perceived quality or appropriateness. The gold standard is instead to evaluate these aspects through user studies, especially subjective evaluations of video stimuli. Common evaluation paradigms either present individual stimuli to be scored on Likert-type scales, or ask users to compare and rate videos in a pairwise fashion. However, the time and resources required for such evaluations scale poorly as the number of conditions to be compared increases. Building on standards used for evaluating the quality of multimedia codecs, this paper instead introduces a framework for granular rating of multiple comparable videos in parallel. This methodology essentially analyses all condition pairs at once. Our contributions are 1) a proposed framework, called HEMVIP, for parallel and granular evaluation of multiple video stimuli and 2) a validation study confirming that results obtained using the tool are in close agreement with results of prior studies using conventional multiple pairwise comparisons.}, keywords={evaluation paradigms, video evaluation, conversational agents, gesture generation}, address={Montr{\'e}al, QC}, month={Oct.}, publisher={ACM}, volume={23}, pages={707--711}, doi={10.1145/3462244.3479957}, year={2021} } @article{jonell2021multimodal, title={Multimodal Capture of Patient Behaviour for Improved Detection of Early Dementia: {C}linical Feasibility and Preliminary Results}, author={Jonell, Patrik and Mo{\"e}ll, Birger and H{\aa}kansson, Krister and Henter, Gustav Eje and Kucherenko, Taras and Mikheeva, Olga and Hagman, G{\"o}ran and Holleman, Jasper and Kivipelto, Miia and Kjellstr{\"o}m, Hedvig and others}, journal={Frontiers in Computer Science}, abstract={Non-invasive automatic screening for Alzheimer's disease has the potential to improve diagnostic accuracy while lowering healthcare costs. Previous research has shown that patterns in speech, language, gaze, and drawing can help detect early signs of cognitive decline. In this paper, we describe a highly multimodal system for unobtrusively capturing data during real clinical interviews conducted as part of cognitive assessments for Alzheimer's disease. The system uses nine different sensor devices (smartphones, a tablet, an eye tracker, a microphone array, and a wristband) to record interaction data during a specialist's first clinical interview with a patient, and is currently in use at Karolinska University Hospital in Stockholm, Sweden. Furthermore, complementary information in the form of brain imaging, psychological tests, speech therapist assessment, and clinical meta-data is also available for each patient. 
We detail our data-collection and analysis procedure and present preliminary findings that relate measures extracted from the multimodal recordings to clinical assessments and established biomarkers, based on data from 25 patients gathered thus far. Our findings demonstrate the feasibility of our proposed methodology and indicate that the collected data can be used to improve clinical assessments of early dementia.}, keywords={Alzheimer, MCI, multimodal prediction, speech, gaze, pupil dilation, pen motion, thermal camera}, month={Apr.}, publisher={Frontiers}, volume={3}, number={642633}, pages={1--22}, doi={10.3389/fcomp.2021.642633}, year={2021} } @article{henter2020moglow, title={{M}o{G}low: {P}robabilistic and controllable motion synthesis using normalising flows}, author={Henter, Gustav Eje and Alexanderson, Simon and Beskow, Jonas}, journal={ACM Trans. Graph.}, abstract={Data-driven modelling and synthesis of motion is an active research area with applications that include animation, games, and social robotics. This paper introduces a new class of probabilistic, generative, and controllable motion-data models based on normalising flows. Models of this kind can describe highly complex distributions, yet can be trained efficiently using exact maximum likelihood, unlike GANs or VAEs. Our proposed model is autoregressive and uses LSTMs to enable arbitrarily long time-dependencies. Importantly, it is also causal, meaning that each pose in the output sequence is generated without access to poses or control inputs from future time steps; this absence of algorithmic latency is important for interactive applications with real-time motion control. The approach can in principle be applied to any type of motion since it does not make restrictive, task-specific assumptions regarding the motion or the character morphology. We evaluate the models on motion-capture datasets of human and quadruped locomotion. Objective and subjective results show that randomly-sampled motion from the proposed method outperforms task-agnostic baselines and attains a motion quality close to recorded motion capture.}, keywords={generative models, machine learning, normalising flows, Glow, footstep analysis, data dropout}, month={Nov.}, publisher={ACM}, volume={39}, number={6}, pages={236:1--236:14}, doi={10.1145/3414685.3417836}, year={2020} } @inproceedings{kucherenko2020gesticulator, title={Gesticulator: {A} framework for semantically-aware speech-driven gesture generation}, author={Kucherenko, Taras and Jonell, Patrik and van Waveren, Sanne and Henter, Gustav Eje and Alexanderson, Simon and Leite, Iolanda and Kjellstr{\"o}m, Hedvig}, booktitle={Proc. ICMI}, abstract={During speech, people spontaneously gesticulate, which plays a key role in conveying information. Similarly, realistic co-speech gestures are crucial to enable natural and smooth interactions with social agents. Current data-driven co-speech gesture generation systems use a single modality for representing speech: either audio or text. These systems are therefore confined to producing either acoustically-linked beat gestures or semantically-linked gesticulation (e.g., raising a hand when saying ``high''): they cannot appropriately learn to generate both gesture types. We present a model designed to produce arbitrary beat and semantic gestures together. Our deep-learning based model takes both acoustic and semantic representations of speech as input, and generates gestures as a sequence of joint angle rotations as output.
The resulting gestures can be applied to both virtual agents and humanoid robots. Subjective and objective evaluations confirm the success of our approach. The code is publicly available at \href{https://github.com/svito-zar/gesticulator/}{github.com/svito-zar/gesticulator}.}, keywords={gesture generation, virtual agents, socially intelligent systems, co-speech gestures, multi-modal interaction, deep learning}, address={Utrecht, The Netherlands}, month={Oct.}, publisher={ACM}, volume={22}, pages={242--250}, doi={10.1145/3382507.3418815}, year={2020} } @inproceedings{jonell2020let, title={Let's Face It: {P}robabilistic Multi-modal Interlocutor-aware Generation of Facial Gestures in Dyadic Settings}, author={Jonell, Patrik and Kucherenko, Taras and Henter, Gustav Eje and Beskow, Jonas}, booktitle={Proc. IVA}, abstract={To enable more natural face-to-face interactions, conversational agents need to adapt their behavior to their interlocutors. One key aspect of this is generation of appropriate non-verbal behavior for the agent, for example facial gestures, here defined as facial expressions and head movements. Most existing gesture-generating systems do not utilize multi-modal cues from the interlocutor when synthesizing non-verbal behavior. Those that do typically use deterministic methods that risk producing repetitive and non-vivid motions. In this paper, we introduce a probabilistic method to synthesize interlocutor-aware facial gestures -- represented by highly expressive FLAME parameters -- in dyadic conversations. Our contributions are: a) a method for feature extraction from multi-party video and speech recordings, resulting in a representation that allows for independent control and manipulation of expression and speech articulation in a 3D avatar; b) an extension to MoGlow, a recent motion-synthesis method based on normalizing flows, to also take multi-modal signals from the interlocutor as input and subsequently output interlocutor-aware facial gestures; and c) a subjective evaluation assessing the use and relative importance of the different modalities in the synthesized output. The results show that the model successfully leverages the input from the interlocutor to generate more appropriate behavior. Videos, data, and code are available at: https://jonepatr.github.io/lets\_face\_it}, address={Glasgow, UK}, month={Oct.}, publisher={ACM}, volume={20}, pages={31:1--31:8}, doi={10.1145/3383652.3423911}, year={2020} } @article{kucherenko2021moving, title={Moving fast and slow: {A}nalysis of representations and post-processing in speech-driven automatic gesture generation}, author={Kucherenko, Taras and Hasegawa, Dai and Kaneko, Naoshi and Henter, Gustav Eje and Kjellstr{\"o}m, Hedvig}, journal={International Journal of Human-Computer Interaction}, abstract={This paper presents a novel framework for speech-driven gesture production, applicable to virtual agents to enhance human-computer interaction. Specifically, we extend recent deep-learning-based, data-driven methods for speech-driven gesture generation by incorporating representation learning. Our model takes speech as input and produces gestures as output, in the form of a sequence of 3D coordinates. We provide an analysis of different representations for the input (speech) and the output (motion) of the network by both objective and subjective evaluations. We also analyze the importance of smoothing of the produced motion.
Our results indicated that the proposed method improved on our baseline in terms of objective measures. For example, it better captured the motion dynamics and better matched the motion-speed distribution. Moreover, we performed user studies on two different datasets. The studies confirmed that our proposed method is perceived as more natural than the baseline, although the difference in the studies was eliminated by appropriate post-processing: hip-centering and smoothing. We conclude that it is important to take both motion representation and post-processing into account when designing an automatic gesture-production method.}, keywords={Gesture generation, representation learning, neural network, deep learning, virtual agents, non-verbal behavior}, month={Feb.}, publisher={Taylor \& Francis}, volume={37}, number={14}, pages={1300--1316}, doi={10.1080/10447318.2021.1883883}, year={2021} } @inproceedings{szekely2020breathing, title={Breathing and speech planning in spontaneous speech synthesis}, author={Sz{\'e}kely, {\'E}va and Henter, Gustav Eje and Beskow, Jonas and Gustafson, Joakim}, booktitle={Proc. ICASSP}, abstract={Breathing and speech planning in spontaneous speech are coordinated processes, often exhibiting disfluent patterns. While synthetic speech is not subject to respiratory needs, integrating breath into synthesis has advantages for naturalness and recall. At the same time, a synthetic voice reproducing disfluent breathing patterns learned from the data can be problematic. To address this, we first propose training stochastic TTS on a corpus of overlapping breath-group bigrams, to take context into account. Next, we introduce an unsupervised automatic annotation of likely-disfluent breath events, through a product-of-experts model that combines the output of two breath-event predictors, each using complementary information and operating in opposite directions. This annotation enables creating an automatically-breathing spontaneous speech synthesiser with a more fluent breathing style. A subjective evaluation on two spoken genres (impromptu and rehearsed) found the proposed system to be preferred over the baseline approach treating all breath events the same.}, keywords={Speech synthesis, spontaneous speech, breathing, speech planning, ensemble method}, address={Virtual conference}, month={May}, publisher={IEEE}, volume={45}, pages={7649--7653}, doi={10.1109/ICASSP40776.2020.9054107}, year={2020} } @inproceedings{alexanderson2020robust, title={Robust model training and generalisation with {S}tudentising flows}, author={Alexanderson, Simon and Henter, Gustav Eje}, booktitle={Proc. INNF+}, abstract={Normalising flows are tractable probabilistic models that leverage the power of deep learning to describe a wide parametric family of distributions, all while remaining trainable using maximum likelihood. We discuss how these methods can be further improved based on insights from robust (in particular, resistant) statistics. Specifically, we propose to endow flow-based models with fat-tailed latent distributions such as multivariate Student's t, as a simple drop-in replacement for the Gaussian distribution used by conventional normalising flows. While robustness brings many advantages, this paper explores two of them: 1) We describe how using fatter-tailed base distributions can give benefits similar to gradient clipping, but without compromising the asymptotic consistency of the method. 
2) We also discuss how robust ideas lead to models with reduced generalisation gap and improved held-out data likelihood. Experiments on several different datasets confirm the efficacy of the proposed approach in both regards.}, keywords={Statistical robustness, normalising flows, Student's t-distribution, outliers, resistant statistics, generalisation}, address={Virtual workshop}, month={July}, volume={2}, pages={25:1--25:9}, url={https://invertibleworkshop.github.io/INNF\_2020/accepted\_papers/pdfs/25.pdf}, year={2020} } @article{alexanderson2020style, title={Style-controllable speech-driven gesture synthesis using normalising flows}, author={Alexanderson, Simon and Henter, Gustav Eje and Kucherenko, Taras and Beskow, Jonas}, journal={Computer Graphics Forum}, abstract={Automatic synthesis of realistic gestures promises to transform the fields of animation, avatars and communicative agents. In off-line applications, novel tools can alter the role of an animator to that of a director, who provides only high-level input for the desired animation; a learned network then translates these instructions into an appropriate sequence of body poses. In interactive scenarios, systems for generating natural animations on the fly are key to achieving believable and relatable characters. In this paper we address some of the core issues towards these ends. By adapting a deep learning-based motion synthesis method called MoGlow, we propose a new generative model for generating state-of-the-art realistic speech-driven gesticulation. Owing to the probabilistic nature of the approach, our model can produce a battery of different, yet plausible, gestures given the same input speech signal. Just like humans, this gives a rich natural variation of motion. We additionally demonstrate the ability to exert directorial control over the output style, such as gesture level, speed, symmetry and spatial extent. Such control can be leveraged to convey a desired character personality or mood. We achieve all this without any manual annotation of the data. User studies evaluating upper-body gesticulation confirm that the generated motions are natural and well match the input speech. Our method scores above all prior systems and baselines on these measures, and comes close to the ratings of the original recorded motions. We furthermore find that we can accurately control gesticulation styles without unnecessarily compromising perceived naturalness. Finally, we also demonstrate an application of the same method to full-body gesticulation, including the synthesis of stepping motion and stance.}, keywords={motion capture, animation, neural networks}, month={May}, publisher={John Wiley \& Sons}, volume={39}, number={2}, pages={487--496}, doi={10.1111/cgf.13946}, year={2020} } @article{sarfjoo2019transformation, title={Transformation of low-quality device-recorded speech to high-quality speech using improved {SEGAN} model}, author={Sarfjoo, Seyyed Saeed and Wang, Xin and Henter, Gustav Eje and Lorenzo-Trueba, Jaime and Takaki, Shinji and Yamagishi, Junichi}, journal={arXiv preprint arXiv:1911.03952}, abstract={Nowadays, vast amounts of speech data are recorded from low-quality recording devices such as smartphones, tablets, laptops, and medium-quality microphones. The objective of this research was to study the automatic generation of high-quality speech from such low-quality device-recorded speech, which could then be applied to many speech-generation tasks.
In this paper, we first introduce our new device-recorded speech dataset and then propose an improved end-to-end method for automatically transforming the low-quality device-recorded speech into professional high-quality speech. Our method is an extension of a generative adversarial network (GAN)-based speech enhancement model called speech enhancement GAN (SEGAN), and we present two modifications to make model training more robust and stable. Finally, from a large-scale listening test, we show that our method can significantly enhance the quality of device-recorded speech signals.}, keywords={audio transformation, speech enhancement, generative adversarial network, speech synthesis}, month={Nov.}, year={2019} } @book{leijon2012pattern, title={Pattern Recognition: Fundamental Theory and Exercise Problems}, author={Leijon, Arne and Henter, Gustav Eje}, address={Stockholm, Sweden}, note={2015 ed.}, publisher={School of Electrical Engineering, KTH Royal Institute of Technology}, year={2012} } @article{henter2016kernel, title={Kernel Density Estimation-Based {M}arkov Models with Hidden State}, author={Henter, Gustav Eje and Leijon, Arne and Kleijn, W. Bastiaan}, journal={arXiv preprint arXiv:1807.11320}, abstract={We consider Markov models of stochastic processes where the next-step conditional distribution is defined by a kernel density estimator (KDE), similar to Markov forecast densities and certain time-series bootstrap schemes. The KDE Markov models (KDE-MMs) we discuss are nonlinear, nonparametric, fully probabilistic representations of stationary processes, based on techniques with strong asymptotic consistency properties. The models generate new data by concatenating points from the training data sequences in a context-sensitive manner, together with some additive driving noise. We present novel EM-type maximum-likelihood algorithms for data-driven bandwidth selection in KDE-MMs. Additionally, we augment the KDE-MMs with a hidden state, yielding a new model class, KDE-HMMs. The added state variable captures non-Markovian long memory and signal structure (e.g., slow oscillations), complementing the short-range dependences described by the Markov process. The resulting joint Markov and hidden-Markov structure is appealing for modelling complex real-world processes such as speech signals. We present guaranteed-ascent EM-update equations for model parameters in the case of Gaussian kernels, as well as relaxed update formulas that greatly accelerate training in practice. Experiments demonstrate increased held-out set probability for KDE-HMMs on several challenging natural and synthetic data series, compared to traditional techniques such as autoregressive models, HMMs, and their combinations.}, keywords={hidden Markov models, nonparametric methods, kernel density estimation, autoregressive models, time-series bootstrap}, month={July}, year={2018} } @inproceedings{szekely2019how, title={How to train your fillers: uh and um in spontaneous speech synthesis}, author={Sz{\'e}kely, {\'E}va and Henter, Gustav Eje and Beskow, Jonas and Gustafson, Joakim}, booktitle={Proc. SSW}, abstract={Using spontaneous conversational speech for TTS raises questions on how disfluencies such as filled pauses (FPs) should be approached. Detailed annotation of FPs in training data enables precise control at synthesis time; coarse or nonexistent FP annotation, when combined with stochastic attention-based neural TTS, leads to synthesisers that insert these phenomena into fluent prompts of their own accord.
In this study we investigate, objectively and subjectively, the effects of FP annotation and the impact of relinquishing control over FPs in a Tacotron TTS system. The training corpus comprised 9 hours of single-speaker breath groups extracted from a conversational podcast. Systems trained with no or location-only FP annotation were found to reproduce FP locations and types (uh/um) in a pattern broadly similar to that of the corpus. We also studied the effect of FPs on natural and synthetic speech rate and the interchangeability of FP types. Interestingly, subjective tests indicate that synthesiser-predicted FP types from location-only annotation were often preferred over specifying the ground-truth type. In contrast, a more precise annotation, allowing us to focus training on the most fluent parts of the corpus, improved rated naturalness when synthesising fluent speech.}, keywords={speech synthesis, spontaneous speech, filled pauses, disfluencies}, address={Vienna, Austria}, month={Sept.}, publisher={ISCA}, volume={10}, pages={245--250}, doi={10.21437/SSW.2019-44}, year={2019} } @inproceedings{wagner2019speech, title={Speech synthesis evaluation -- State-of-the-art assessment and suggestion for a novel research program}, author={Wagner, Petra and Beskow, Jonas and Betz, Simon and Edlund, Jens and Gustafson, Joakim and Henter, Gustav Eje and Le Maguer, S{\'e}bastien and Malisz, Zofia and Sz{\'e}kely, {\'E}va and T\r{a}nnander, Christina and Vo{\ss}e, Jana}, booktitle={Proc. SSW}, abstract={Speech synthesis applications have become ubiquitous, in navigation systems, digital assistants, and screen or audio book readers. Despite their impact on the acceptability of the systems in which they are embedded, and despite the fact that different applications probably need different types of TTS voices, TTS evaluation is still largely treated as an isolated problem. Even though there is strong agreement among researchers that the mainstream approaches to Text-to-Speech (TTS) evaluation are often insufficient and may even be misleading, there exist few clear-cut suggestions as to (1) how TTS evaluations may be realistically improved on a large scale, and (2) how such improvements may lead to informed feedback for system developers and, ultimately, better systems relying on TTS. This paper reviews the current state-of-the-art in TTS evaluation, and suggests a novel user-centered research program for this area.}, address={Vienna, Austria}, month={Sept.}, publisher={ISCA}, volume={10}, pages={105--110}, doi={10.21437/SSW.2019-19}, year={2019} } @inproceedings{watts2019where, title={Where do the improvements come from in sequence-to-sequence neural {TTS}?}, author={Watts, Oliver and Henter, Gustav Eje and Fong, Jason and Valentini-Botinhao, Cassia}, booktitle={Proc. SSW}, abstract={Sequence-to-sequence neural networks with attention mechanisms have recently been widely adopted for text-to-speech. Compared with older, more modular statistical parametric synthesis systems, sequence-to-sequence systems feature three prominent innovations: 1) They replace substantial parts of traditional fixed front-end processing pipelines (like Festival's) with learned text analysis; 2) They jointly learn to align text and speech and to synthesise speech audio from text; 3) They operate autoregressively on previously-generated acoustics. Naturalness improvements have been reported relative to earlier systems which do not contain these innovations.
It would be useful to know how much each of the various innovations contributes to the improved performance. We here propose one way of associating the separately-learned components of a representative older modular system, specifically Merlin, with the different sub-networks within recent neural sequence-to-sequence architectures, specifically Tacotron 2 and DCTTS. This allows us to swap in and out various components and subnets to produce intermediate systems that step between the two paradigms; subjective evaluation of these systems then allows us to isolate the perceptual effects of the various innovations. We report on the design, evaluation, and findings of such an experiment.}, keywords={speech synthesis, end-to-end, SPSS, naturalness}, address={Vienna, Austria}, month={Sept.}, publisher={ISCA}, volume={10}, pages={217--222}, doi={10.21437/SSW.2019-39}, year={2019} } @inproceedings{szekely2019cuff, title={Off the cuff: Exploring extemporaneous speech delivery with {TTS}}, author={Sz{\'e}kely, {\'E}va and Henter, Gustav Eje and Beskow, Jonas and Gustafson, Joakim}, booktitle={Proc. Interspeech}, abstract={Extemporaneous speech is a delivery type in public speaking which uses a structured outline but is otherwise delivered conversationally, off the cuff. This demo uses a natural-sounding spontaneous conversational speech synthesiser to simulate this delivery style. We resynthesised the beginnings of two Interspeech keynote speeches with TTS that produces multiple different versions of each utterance that vary in fluency and filled-pause placement. The platform allows the user to mark the samples according to any perceptual aspect of interest, such as certainty, authenticity, confidence, etc. During the speech delivery, they can decide on the fly which realisation to play, addressing their audience in a connected, conversational fashion. Our aim is to use this platform to explore speech synthesis evaluation options from a production perspective and in situational contexts.}, keywords={spontaneous speech synthesis, public speaking, speech synthesis evaluation, filled pauses, AAC, soundboard}, address={Graz, Austria}, month={Sept.}, publisher={ISCA}, volume={20}, pages={3687--3688}, url={https://www.isca-speech.org/archive/interspeech\_2019/szekely19\_interspeech.html}, year={2019} } @inproceedings{szekely2019spontaneous, title={Spontaneous conversational speech synthesis from found data}, author={Sz{\'e}kely, {\'E}va and Henter, Gustav Eje and Beskow, Jonas and Gustafson, Joakim}, booktitle={Proc. Interspeech}, abstract={Synthesising spontaneous speech is a difficult task due to disfluencies, high variability and syntactic conventions different from those of written language. Using found data, as opposed to lab-recorded conversations, for speech synthesis adds to these challenges because of overlapping speech and the lack of control over recording conditions. In this paper we address these challenges by using a speaker-dependent CNN-LSTM breath detector to separate continuous recordings into utterances, which we here apply to extract nine hours of clean single-speaker breath groups from a conversational podcast. The resulting corpus is transcribed automatically (both lexical items and filler tokens) and used to build several voices on a Tacotron 2 architecture.
Listening tests show: i) pronunciation accuracy improved with phonetic input and transfer learning; ii) it is possible to create a more fluent conversational voice by training on data without filled pauses; and iii) the presence of filled pauses improved perceived speaker authenticity. Another listening test showed the found podcast voice to be more appropriate for prompts from both public speeches and casual conversations, compared to synthesis from found read speech and from a manually transcribed lab-recorded spontaneous conversation.}, keywords={speech synthesis, conversational speech, spontaneous speech, hesitations, disfluencies, found data}, address={Graz, Austria}, month={Sept.}, publisher={ISCA}, volume={20}, pages={4435--4439}, doi={10.21437/Interspeech.2019-2836}, year={2019} } @inproceedings{kucherenko2019analyzing, title={Analyzing Input and Output Representations for Speech-Driven Gesture Generation}, author={Kucherenko, Taras and Hasegawa, Dai and Henter, Gustav Eje and Kaneko, Naoshi and Kjellstr{\"o}m, Hedvig}, booktitle={Proc. IVA}, abstract={This paper presents a novel framework for automatic speech-driven gesture generation, applicable to human-agent interaction including both virtual agents and robots. Specifically, we extend recent deep-learning-based, data-driven methods for speech-driven gesture generation by incorporating representation learning. Our model takes speech as input and produces gestures as output, in the form of a sequence of 3D coordinates. Our approach consists of two steps. First, we learn a lower-dimensional representation of human motion using a denoising autoencoder neural network, consisting of a motion encoder MotionE and a motion decoder MotionD. The learned representation preserves the most important aspects of the human pose variation while removing less relevant variation. Second, we train a novel encoder network SpeechE to map from speech to a corresponding motion representation with reduced dimensionality. At test time, the speech encoder and the motion decoder networks are combined: SpeechE predicts motion representations based on a given speech signal and MotionD then decodes these representations to produce motion sequences. We evaluate different representation sizes in order to find the most effective dimensionality for the representation. We also evaluate the effects of using different speech features as input to the model. We find that mel-frequency cepstral coefficients (MFCCs), alone or combined with prosodic features, perform the best. The results of a subsequent user study confirm the benefits of the representation learning.}, keywords={gesture generation, social robotics, representation learning, neural network, deep learning, virtual agents}, address={Paris, France}, month={July}, publisher={ACM}, volume={19}, pages={97--104}, doi={10.1145/3308532.3329472}, year={2019} } @inproceedings{szekely2019casting, title={Casting to corpus: Segmenting and selecting spontaneous dialogue for {TTS} with a {CNN}-{LSTM} speaker-dependent breath detector}, author={Sz{\'e}kely, {\'E}va and Henter, Gustav Eje and Gustafson, Joakim}, booktitle={Proc. ICASSP}, abstract={This paper considers utilising breaths to create improved spontaneous-speech corpora for conversational text-to-speech from found audio recordings such as dialogue podcasts. Breaths are of interest since they relate to prosody and speech planning and are independent of language and transcription. 
Specifically, we propose a semi-supervised approach where a fraction of coarsely annotated data is used to train a convolutional and recurrent speaker-specific breath detector operating on spectrograms and zero-crossing rate. The classifier output is used to find target-speaker breath groups (audio segments delineated by breaths) and subsequently select those that constitute clean utterances appropriate for a synthesis corpus. An application to 11 hours of raw podcast audio extracts 1969 utterances (106 minutes), 87\% of which are clean and correctly segmented. This outperforms a baseline that performs integrated VAD and speaker attribution without accounting for breaths.}, keywords={spontaneous speech, found data, speech synthesis corpora, breath detection, computational paralinguistics}, address={Brighton, UK}, month={May}, publisher={IEEE}, volume={44}, pages={6925--6929}, doi={10.1109/ICASSP.2019.8683846}, year={2019} } @inproceedings{kucherenko2019importance, title={On the Importance of Representations for Speech-Driven Gesture Generation}, author={Kucherenko, Taras and Hasegawa, Dai and Kaneko, Naoshi and Henter, Gustav Eje and Kjellstr{\"o}m, Hedvig}, booktitle={Proc. AAMAS}, abstract={This paper presents a novel framework for automatic speech-driven gesture generation applicable to human-agent interaction, including both virtual agents and robots. Specifically, we extend recent deep-learning-based, data-driven methods for speech-driven gesture generation by incorporating representation learning. Our model takes speech features as input and produces gestures in the form of sequences of 3D joint coordinates representing motion as output. The results of objective and subjective evaluations confirm the benefits of the representation learning.}, keywords={gesture generation, social robotics, representation learning, neural network, deep learning, virtual agents}, address={Montreal, QC}, month={May}, publisher={IFAAMAS}, volume={18}, pages={2072--2074}, url={https://dl.acm.org/citation.cfm?id=3306127.3332014}, year={2019} } @inproceedings{szekely2019ukspeech, title={Spontaneous conversational {TTS} from found data}, author={Sz{\'e}kely, {\'E}va and Henter, Gustav Eje and Beskow, Jonas and Gustafson, Joakim}, booktitle={Proc. UK Speech}, abstract={Most of human speech occurs in spontaneous conversation, making it an important goal to replicate such speech with text-to-speech (TTS). Using spontaneous conversational speech data in synthesis is however a challenge due to disfluencies, syntactic differences from written language, and general high variability. Moreover, building synthesisers from genuine spontaneous conversations found in the wild (as opposed to conversations elicited and recorded in the lab) brings further complications such as overlapping speech, lack of transcriptions, and no control over recording conditions. Taken together, these challenges mean that synthesis of conversational spontaneous speech from found data has seldom, if ever, been attempted before. We have previously proposed to address some of the above issues by using deep learning to automatically identify and extract single-speaker breath groups (segments of speech bookended by breaths). In this study we build several Tacotron 2 voices on a corpus of 9 hours of clean single-speaker US English breath groups from a conversational podcast and transcribed using off-the-shelf ASR. 
Our findings from listening tests on these voices include: 1) Phonetic instead of graphemic input improved pronunciation accuracy, as did transfer learning from a larger read-speech corpus. 2) If filler tokens are left untranscribed, the stochastic synthesis will spontaneously insert filled pauses (FPs) into the output with an FP distribution broadly similar to that in the training corpus. With filler tokens transcribed, FPs are only synthesised when requested. Thus control over output FPs is possible but optional. 3) The presence of filled pauses improved perceived speaker authenticity when synthesising a sequence of extemporaneous prompts. 4) More fluent conversational TTS can be achieved by omitting disfluent utterances from the training corpus. 5) When speaking spontaneous prompts (from public speeches as well as casual conversation), our new voices were preferred over both read-speech synthesis from found data and spontaneous-speech synthesis from a small, carefully transcribed, lab-recorded corpus of spontaneous conversational speech.}, address={Birmingham, UK}, month={June}, volume={8}, pages={31}, url={http://www.thespeechark.com/ukspeech2019/abstractBook\_UKSpeech2019\_fin\_noLogos.pdf}, year={2019} } @inproceedings{watts2019ukspeech, title={Sequence-to-sequence neural {TTS}: an assessment of the contribution of various ingredients}, author={Watts, Oliver and Henter, Gustav Eje and Fong, Jason and Valentini-Botinhao, Cassia}, booktitle={Proc. UK Speech}, abstract={Sequence-to-sequence neural networks with attention mechanisms have recently been widely adopted for text-to-speech. Compared with older, more modular statistical parametric synthesis systems, sequence-to-sequence systems feature three prominent innovations: 1) They replace substantial parts of traditional fixed front-end processing pipelines (like Festival's) with learned text analysis; 2) They jointly learn to align text and speech and to synthesise speech audio from text; 3) They operate autoregressively on previously-generated acoustics. Performance improvements have been reported relative to earlier systems which do not contain these innovations. It would be useful to know how much each of the various innovations contributes to the improved performance. We here propose one way of associating the separately-learned components of a representative older modular system, specifically Merlin, with the different sub-networks within recent neural sequence-to-sequence architectures, specifically Tacotron 2 and DCTTS. This allows us to swap in and out various components and subnets to produce intermediate systems that step between the two paradigms; subjective evaluation of these systems then allows us to isolate the perceptual effects of the various innovations. We report on the design, evaluation, and findings of such an experiment.}, address={Birmingham, UK}, month={June}, volume={8}, pages={64}, url={http://www.thespeechark.com/ukspeech2019/abstractBook\_UKSpeech2019\_fin\_noLogos.pdf}, year={2019} } @inproceedings{malisz2019ukspeech, title={Modern speech synthesis and its implications for speech sciences}, author={Malisz, Zofia and Henter, Gustav Eje and Valentini-Botinhao, Cassia and Watts, Oliver and Beskow, Jonas and Gustafson, Joakim}, booktitle={Proc. UK Speech}, abstract={Speech technology (e.g., speech synthesis) and speech sciences (e.g., phonetics) depend on an ongoing dialogue that benefits both fields. 
Insights into speech production, like source-filter separation, and perception, like the mel scale, were for example central in the development of classical formant-based synthesis technology and remain important also today. Speech sciences have also contributed towards advanced synthetic-speech evaluation methods. In return, milestones in phonetics such as evidence for categorical perception as well as advances like the motor theory of speech perception and acoustic cue analysis have relied on support from experiments on synthesised speech. However, in recent decades the two fields have grown apart: Speech technologists have primarily pursued increasingly natural-sounding synthesis, relinquishing precise output control in the process. Speech scientists and phoneticians, meanwhile, have remained reliant on legacy synthesisers, since only these provide the careful output control necessary for phonetic studies. Unfortunately, a body of research has over the years identified substantial perceptual differences between natural speech and classical formant synthesis, casting doubt on speech-science findings from synthetic speech. Recently, breakthroughs in deep learning have fuelled a rapid acceleration of speech-technology capabilities. In this work, we argue that modern speech synthesis with deep learning in fact has the potential to address both of the two key concerns of speech scientists -- control and realism -- by 1) bringing back precise control over synthetic-speech output and 2) significantly closing the perceptual gap between natural and synthetic speech. Both claims find support in recent research in speech-synthesis technology. We supplement our two claims with an empirical evaluation contrasting classic rule-based formant synthesis (OVE III) against state-of-the-art synthesis methods, specifically speech-in-speech-out copy synthesis (MagPhase and Griffin-Lim), DNN-based statistical parametric text-to-speech (Merlin), and sequence-to-sequence neural TTS (DCTTS). The systems are compared in terms of subjective naturalness ratings as well as on a behavioural measure (response times in a lexical decision task). We find that all modern methods vastly improve on formant synthesis naturalness and are rated above OVE III at least 99\% of the time. Moreover, response times for copy-synthesis and Merlin are found not to differ notably from response times to natural speech, meaning that the troubling processing gap of older systems (including OVE III) is no longer evident. In light of these findings and the parallel advances in synthesis control, the time is ripe for phonetics researchers to consider what modern speech-synthesis technology can do for their research problems.}, address={Birmingham, UK}, month={June}, volume={8}, pages={11--12}, url={http://www.thespeechark.com/ukspeech2019/abstractBook\_UKSpeech2019\_fin\_noLogos.pdf}, year={2019} } @inproceedings{malisz2019speech, title={The speech synthesis phoneticians need is both realistic and controllable}, author={Malisz, Zofia and Henter, Gustav Eje and Valentini-Botinhao, Cassia and Watts, Oliver and Beskow, Jonas and Gustafson, Joakim}, booktitle={Proc. FONETIK}, abstract={We discuss the circumstances that have led to a disjoint advancement of speech synthesis and phonetics in recent decades. The difficulties mainly rest on the pursuit of orthogonal goals by the two fields: realistic vs. controllable synthetic speech. 
We make a case for realising the promise of speech technologies in areas of speech sciences by developing control of neural speech synthesis and bringing the two areas into dialogue again.}, address={Stockholm, Sweden}, month={June}, publisher={Stockholm University}, volume={31}, pages={103--107}, doi={10.5281/zenodo.3246014}, year={2019} } @misc{szekely2019fonetik, title={Spontaneous conversational speech synthesis: The making of a podcast voice -- breathing, uhs \& ums and some ponderings about appropriateness}, author={Sz{\'e}kely, {\'E}va and Henter, Gustav Eje and Beskow, Jonas and Gustafson, Joakim}, howpublished={Show \& tell at \textit{FONETIK}}, address={Stockholm, Sweden}, month={June}, volume={31}, year={2019} } @inproceedings{lorenzo2017analyzing, title={Analyzing the impact of including listener perception annotations in {RNN}-based emotional speech synthesis}, author={Lorenzo-Trueba, Jaime and Henter, Gustav Eje and Takaki, Shinji and Yamagishi, Junichi}, booktitle={IPSJ SIG Tech. Rep.}, abstract={This paper investigates simultaneous modeling of multiple emotions in DNN-based expressive speech synthesis, and how to represent the emotional labels, such as emotional class and strength, for this task. Our goal is to answer two questions: First, what is the best way to annotate speech data with multiple emotions? Second, how should the emotional information be represented as labels for supervised DNN training? We evaluate on a large-scale corpus of emotional speech from a professional actress, additionally annotated with perceived emotional labels from crowd-sourced listeners. By comparing DNN-based speech synthesizers that utilize different emotional representations, we assess the impact of these representations and design decisions on human emotion recognition rates.}, address={Tokyo, Japan}, month={Dec.}, publisher={IPSJ}, volume={2017-SLP-119}, number={8}, pages={1--2}, url={https://ipsj.ixsq.nii.ac.jp/ej/?action=repository\_uri\&item\_id=184864}, year={2017} } @inproceedings{henter2018generating, title={Generating segment-level foreign-accented synthetic speech with natural speech prosody}, author={Henter, Gustav Eje and Lorenzo-Trueba, Jaime and Wang, Xin and Kondo, Mariko and Yamagishi, Junichi}, booktitle={IPSJ SIG Tech. Rep.}, abstract={We present a new application of deep-learning-based TTS, namely multilingual speech synthesis for generating controllable foreign accent. We train an acoustic model on non-accented multilingual speech recordings from the same speaker and interpolate quinphone linguistic features between languages to generate microscopic foreign accent. By copying pitch and durations from a pre-recorded utterance of the desired prompt, natural prosody is achieved. We call this paradigm "cyborg speech" as it combines human and machine speech parameters. Experiments on synthetic American-English-accented Japanese confirm the success of the approach.}, keywords={controllable speech synthesis, foreign accent, multilingual speech synthesis, speech perception}, address={Tsukuba, Japan}, month={Jan.}, publisher={IPSJ}, volume={2018-SLP-120}, number={8}, pages={1--3}, url={https://ipsj.ixsq.nii.ac.jp/ej/?action=repository\_uri\&item\_id=185801}, year={2018} } @inproceedings{henter2017nonparametric, title={Non-parametric duration modelling for speech synthesis with a joint model of acoustics and duration}, author={Henter, Gustav Eje and Ronanki, Srikanth and Watts, Oliver and King, Simon}, booktitle={IEICE Tech. 
Rep.}, abstract={We describe a new approach to duration modelling for statistical parametric speech synthesis, in which a statistical model is trained to output a phone transition probability at each time unit. Unlike conventional duration modelling -- which assumes that duration distributions have a particular shape and uses the mean of that distribution for synthesis -- our approach can in principle model any distribution supported on the positive integers. Generation from this model can be performed in many ways; here we consider output generation based on the median or other quantiles of the predicted duration. Compared to conventional mean durations, the median is more typical (more probable), is robust to training-data irregularities, and enables incremental generation. Furthermore, our approach is consistent with a longer-term goal of modelling durations and acoustic features together. Results indicate that the proposed method is competitive with baseline approaches in approximating the median duration of held-out natural speech. We also discuss extensions that allow iterative realignment and adjusting the global speech rate.}, keywords={text-to-speech, speech synthesis, duration modelling, non-parametric models, LSTMs}, address={Tokyo, Japan}, month={Jan.}, publisher={IEICE}, volume={116}, number={414}, pages={11--16}, url={https://www.ieice.org/ken/paper/20170121lbp6/}, year={2017} } @inproceedings{henter2016asaajs, title={Robust text-to-speech duration modelling with a deep neural network}, author={Henter, Gustav Eje and Ronanki, Srikanth and Watts, Oliver and Wester, Mirjam and Wu, Zhizheng and King, Simon}, booktitle={Proc. ASA/ASJ}, abstract={Accurate modeling and prediction of speech-sound durations is important for generating more natural synthetic speech. Deep neural networks (DNNs) offer powerful models, and large, found corpora of natural speech are easily acquired for training them. Unfortunately, poor quality control (e.g., transcription errors) and phenomena such as reductions and filled pauses complicate duration modelling from found speech data. To mitigate issues caused by these idiosyncrasies, we propose to incorporate methods from robust statistics into speech synthesis. Robust methods can disregard ill-fitting training-data points---errors or other outliers---to describe the typical case better. For instance, parameter estimation can be made robust by replacing maximum likelihood with a robust estimation criterion based on the density power divergence (a.k.a. the beta-divergence). Alternatively, a standard approximation for output generation with mixture density networks (MDNs) can be interpreted as a robust output generation heuristic. To evaluate the potential benefits of robust techniques, we adapted data from a free online audiobook to build several DNN-based text-to-speech systems, with either conventional or robust duration prediction. Our objective results indicate that robust methods described typical durations better than the baselines. 
Additionally, listeners significantly preferred synthetic speech generated using the robust methods in a subjective evaluation.}, keywords={speech synthesis, quality assurance, cognitive science, speech sounds, artificial neural networks, speech communication}, address={Honolulu, HI}, month={Nov.}, publisher={ASA}, volume={140}, number={4}, pages={2961}, doi={10.1121/1.4969147}, year={2016} } @inproceedings{henter2016robust, title={Robust {TTS} duration modelling using {DNN}s}, author={Henter, Gustav Eje and Ronanki, Srikanth and Watts, Oliver and Wester, Mirjam and Wu, Zhizheng and King, Simon}, booktitle={Proc. ICASSP}, abstract={Accurate modelling and prediction of speech-sound durations is an important component in generating more natural synthetic speech. Deep neural networks (DNNs) offer a powerful modelling paradigm, and large, found corpora of natural and expressive speech are easy to acquire for training them. Unfortunately, found datasets are seldom subject to the quality-control that traditional synthesis methods expect. Common issues likely to affect duration modelling include transcription errors, reductions, filled pauses, and forced-alignment inaccuracies. To combat this, we propose to improve modelling and prediction of speech durations using methods from robust statistics, which are able to disregard ill-fitting points in the training material. We describe a robust fitting criterion based on the density power divergence (the beta-divergence) and a robust generation heuristic using mixture density networks (MDNs). Perceptual tests indicate that subjects prefer synthetic speech generated using robust models of duration over the baselines.}, keywords={speech synthesis, duration modelling, robust statistics}, address={Shanghai, China}, month={Mar.}, publisher={IEEE}, volume={41}, pages={5130--5134}, doi={10.1109/ICASSP.2016.7472655}, year={2016} } @inproceedings{ronanki2016template, title={A Template-Based Approach for Speech Synthesis Intonation Generation Using {LSTM}s}, author={Ronanki, Srikanth and Henter, Gustav Eje and Wu, Zhizheng and King, Simon}, booktitle={Proc. Interspeech}, abstract={The absence of convincing intonation makes current parametric speech synthesis systems sound dull and lifeless, even when trained on expressive speech data. Typically, these systems use regression techniques to predict the fundamental frequency (F0) frame-by-frame. This approach leads to overly-smooth pitch contours and fails to construct an appropriate prosodic structure across the full utterance. In order to capture and reproduce larger-scale pitch patterns, this paper proposes a template-based approach for automatic F0 generation, where per-syllable pitch-contour templates (from a small, automatically learned set) are predicted by a recurrent neural network (RNN). The use of syllable templates mitigates the over-smoothing problem and is able to reproduce pitch patterns observed in the data. The use of an RNN, paired with connectionist temporal classification (CTC), enables the prediction of structure in the pitch contour spanning the entire utterance. This novel F0 prediction system is used alongside separate LSTMs for predicting phone durations and the other acoustic features, to construct a complete text-to-speech system. 
We report the results of objective and subjective tests on an expressive speech corpus of children's audiobooks, and include comparisons to a conventional baseline that predicts F0 directly at the frame level.}, keywords={speech synthesis, intonation modelling, F0 templates, LSTM, CTC}, address={San Francisco, CA}, month={Sept.}, publisher={ISCA}, volume={17}, pages={2463--2467}, doi={10.21437/Interspeech.2016-96}, year={2016} } @inproceedings{henter2014ukspeech, title={Measuring the perceptual effects of speech synthesis modelling assumptions}, author={Henter, Gustav Eje and Merritt, Thomas and Shannon, Matt and Mayo, Catherine and King, Simon}, booktitle={Proc. UK Speech}, abstract={Acoustic models used for statistical parametric speech synthesis typically incorporate many modelling assumptions. It is an open question to what extent these assumptions limit the naturalness of synthesised speech. To investigate this question, we recorded a speech corpus where each prompt was read aloud multiple times. By combining speech parameter trajectories extracted from different repetitions, we were able to quantify the perceptual effects of certain commonly used modelling assumptions. Subjective listening tests show that taking the source and filter parameters to be conditionally independent, or using diagonal covariance matrices, significantly limits the naturalness that can be achieved. Our experimental results also demonstrate the shortcomings of mean-based parameter generation.}, address={Edinburgh, UK}, month={June}, volume={3}, pages={37}, url={https://ukspeech.inf.ed.ac.uk/wp-content/uploads/2019/06/uk-speech-2014-abstract-book.pdf}, year={2014} } @article{henter2018deep, title={Deep Encoder-Decoder Models for Unsupervised Learning of Controllable Speech Synthesis}, author={Henter, Gustav Eje and Lorenzo-Trueba, Jaime and Wang, Xin and Yamagishi, Junichi}, journal={arXiv preprint arXiv:1807.11470}, abstract={Generating versatile and appropriate synthetic speech requires control over the output expression separate from the spoken text. Important non-textual speech variation is seldom annotated, in which case output control must be learned in an unsupervised fashion. In this paper, we perform an in-depth study of methods for unsupervised learning of control in statistical speech synthesis. For example, we show that popular unsupervised training heuristics can be interpreted as variational inference in certain autoencoder models. We additionally connect these models to VQ-VAEs, another, recently-proposed class of deep variational autoencoders, which we show can be derived from a very similar mathematical argument. The implications of these new probabilistic interpretations are discussed. We illustrate the utility of the various approaches with an application to acoustic modelling for emotional speech synthesis, where the unsupervised methods for learning expression control (without access to emotional labels) are found to give results that in many aspects match or surpass the previous best supervised approach.}, keywords={controllable speech synthesis, latent variable models, autoencoders, variational inference, VQ-VAE}, month={July}, year={2018} } @article{phan2017consensus, title={Consensus-based sequence training for video captioning}, author={Phan, Sang and Henter, Gustav Eje and Miyao, Yusuke and Satoh, Shin'ichi}, journal={arXiv preprint arXiv:1712.09532}, abstract={Captioning models are typically trained using the cross-entropy loss. 
However, their performance is evaluated on other metrics designed to better correlate with human assessments. Recently, it has been shown that reinforcement learning (RL) can directly optimize these metrics in tasks such as captioning. However, this is computationally costly and requires specifying a baseline reward at each step to make training converge. We propose a fast approach to optimize one's objective of interest through the REINFORCE algorithm. First we show that, by replacing model samples with ground-truth sentences, RL training can be seen as a form of weighted cross-entropy loss, giving a fast, RL-based pre-training algorithm. Second, we propose to use the consensus among ground-truth captions of the same video as the baseline reward. This can be computed very efficiently. We call the complete proposal Consensus-based Sequence Training (CST). Applied to the MSRVTT video captioning benchmark, our proposals train significantly faster than comparable methods and establish a new state-of-the-art on the task, improving the CIDEr score from 47.3 to 54.2.}, month={Dec.}, year={2017} } @article{henter2016analysing, title={Analysing shortcomings of statistical parametric speech synthesis}, author={Henter, Gustav Eje and King, Simon and Merritt, Thomas and Degottex, Gilles}, journal={arXiv preprint arXiv:1807.10941}, abstract={Output from statistical parametric speech synthesis (SPSS) remains noticeably worse than natural speech recordings in terms of quality, naturalness, speaker similarity, and intelligibility in noise. There are many hypotheses regarding the origins of these shortcomings, but these hypotheses are often kept vague and presented without empirical evidence that could confirm and quantify how a specific shortcoming contributes to imperfections in the synthesised speech. Throughout speech synthesis literature, surprisingly little work is dedicated towards identifying the perceptually most important problems in speech synthesis, even though such knowledge would be of great value for creating better SPSS systems. In this book chapter, we analyse some of the shortcomings of SPSS. In particular, we discuss issues with vocoding and present a general methodology for quantifying the effect of any of the many assumptions and design choices that hold SPSS back. The methodology is accompanied by an example that carefully measures and compares the severity of perceptual limitations imposed by vocoding as well as other factors such as the statistical model and its use.}, month={July}, year={2018} } @inproceedings{malisz2019modern, title={Modern speech synthesis for phonetic sciences: a discussion and an evaluation}, author={Malisz, Zofia and Henter, Gustav Eje and Valentini-Botinhao, Cassia and Watts, Oliver and Beskow, Jonas and Gustafson, Joakim}, booktitle={Proc. ICPhS}, abstract={Decades of gradual advances in speech synthesis have recently culminated in exponential improvements fuelled by deep learning. This quantum leap has the potential to finally deliver realistic, controllable, and robust synthetic stimuli for speech experiments. In this article, we discuss these and other implications for phonetic sciences. We substantiate our argument by evaluating classic rule-based formant synthesis against state-of-the-art synthesisers on a) subjective naturalness ratings and b) a behavioural measure (reaction times in a lexical decision task). We also differentiate between text-to-speech and speech-to-speech methods. 
Naturalness ratings indicate that all modern systems are substantially closer to natural speech than formant synthesis. Reaction times for several modern systems do not differ substantially from natural speech, meaning that the processing gap observed in older systems, and reproduced with our formant synthesiser, is no longer evident. Importantly, some speech-to-speech methods are nearly indistinguishable from natural speech on both measures.}, keywords={speech synthesis, scientific methodology, speech technology}, address={Melbourne, Australia}, month={Aug.}, publisher={IPA}, volume={19}, pages={487--491}, url={http://intro2psycholing.net/ICPhS/papers/ICPhS\_536.pdf}, year={2019} } @inproceedings{yoshimura2016hierarchical, title={A Hierarchical Predictor of Synthetic Speech Naturalness Using Neural Networks}, author={Yoshimura, Takenori and Henter, Gustav Eje and Watts, Oliver and Wester, Mirjam and Yamagishi, Junichi and Tokuda, Keiichi}, booktitle={Proc. Interspeech}, abstract={A problem when developing and tuning speech synthesis systems is that there is no well-established method of automatically rating the quality of the synthetic speech. This research attempts to obtain a new automated measure which is trained on the result of large-scale subjective evaluations employing many human listeners, i.e., the Blizzard Challenge. To exploit the data, we experiment with linear regression, feed-forward and convolutional neural network models, and combinations of them to regress from synthetic speech to the perceptual scores obtained from listeners. The biggest improvements were seen when combining stimulus- and system-level predictions.}, keywords={speech synthesis, naturalness, neural network, Blizzard Challenge}, address={San Francisco, CA}, month={Sept.}, publisher={ISCA}, volume={17}, pages={342--346}, doi={10.21437/Interspeech.2016-847}, year={2016} } @inproceedings{ronanki2016ukspeech, title={A template-based approach for intonation generation using {LSTM}s}, author={Ronanki, Srikanth and Henter, Gustav Eje and Wu, Zhizheng and King, Simon}, booktitle={Proc. UK Speech}, abstract={The lack of convincing intonation makes current parametric speech synthesis systems sound dull and lifeless, even when trained on expressive speech data. Typically, these systems predict the fundamental frequency (F0) frame-by-frame using regression models. This approach leads to overly-smooth pitch contours and fails to construct an appropriate prosodic structure across the full utterance. In order to capture and reproduce larger-scale pitch patterns, we propose a classification-based approach to automatic F0 generation, where per-syllable pitch-contour templates (from a small, automatically-learned set) are predicted by a recurrent neural network (RNN). The use of templates mitigates the over-smoothing problem: with only six templates, we can reconstruct pitch patterns observed in the data well (small RMSE). The long memory of RNNs in principle enables the prediction of pitch-contour structure spanning the entire utterance. To construct a complete text-to-speech system, this novel F0 prediction system is used alongside separate LSTMs for predicting phone durations and remaining acoustic features. 
The objective results are encouraging, but listening tests with oracle reconstructions suggest that further work (beyond a simple smoothing) is necessary to reduce subjective artefacts in the template-based F0 reconstructions.}, address={Sheffield, UK}, month={June}, volume={5}, pages={22}, url={http://ukspeech.dcs.shef.ac.uk/ukspeech16\_abstracts.pdf}, year={2016} } @inproceedings{petkov2012enhancing, title={Enhancing Subjective Speech Intelligibility Using a Statistical Model of Speech}, author={Petkov, Petko N. and Kleijn, W. Bastiaan and Henter, Gustav Eje}, booktitle={Proc. Interspeech}, abstract={The intelligibility of speech in adverse noise conditions can be improved by modifying the characteristics of the clean speech prior to its presentation. An effective and flexible paradigm is to select the modification by optimizing a measure of objective intelligibility. Here we apply this paradigm at the text level and optimize a measure related to the classification error probability in an automatic speech recognition system. The proposed method was applied to a simple but powerful band-energy modification mechanism under an energy preservation constraint. Subjective evaluation results provide a clear indication of a significant gain in subjective intelligibility. In contrast to existing methods, the proposed approach is not restricted to a particular modification strategy and treats the notion of optimality at a level closer to that of subjective intelligibility. The computational complexity of the method is sufficiently low to enable its use in on-line applications.}, keywords={speech modification, subjective intelligibility, statistical model of speech}, address={Portland, OR}, month={Sept.}, publisher={ISCA}, volume={13}, pages={166--169}, doi={10.21437/Interspeech.2012-58}, year={2012} } @inproceedings{henter2017principles, title={Principles for learning controllable {TTS} from annotated and latent variation}, author={Henter, Gustav Eje and Lorenzo-Trueba, Jaime and Wang, Xin and Yamagishi, Junichi}, booktitle={Proc. Interspeech}, abstract={For building flexible and appealing high-quality speech synthesisers, it is desirable to be able to accommodate and reproduce fine variations in vocal expression present in natural speech. Synthesisers can enable control over such output properties by adding adjustable control parameters in parallel to their text input. If not annotated in training data, the values of these control inputs can be optimised jointly with the model parameters. We describe how this established method can be seen as approximate maximum likelihood and MAP inference in a latent variable model. This puts previous ideas of (learned) synthesiser inputs such as sentence-level control vectors on a more solid theoretical footing. We furthermore extend the method by restricting the latent variables to orthogonal subspaces via a sparse prior. This enables us to learn dimensions of variation present also within classes in coarsely annotated speech. As an example, we train an LSTM-based TTS system to learn nuances in emotional expression from a speech database annotated with seven different acted emotions. 
Listening tests show that our proposal successfully can synthesise speech with discernible differences in expression within each emotion, without compromising the recognisability of synthesised emotions compared to an identical system without learned nuances.}, keywords={text-to-speech, latent variables, paralinguistics}, address={Stockholm, Sweden}, month={Aug.}, publisher={ISCA}, volume={18}, pages={3956--3960}, doi={10.21437/Interspeech.2017-171}, year={2017} } @inproceedings{lorenzo2017misperceptions, title={Misperceptions of the emotional content of natural and vocoded speech in a car}, author={Lorenzo-Trueba, Jaime and Valentini-Botinhao, Cassia and Henter, Gustav Eje and Yamagishi, Junichi}, booktitle={Proc. Interspeech}, abstract={This paper analyzes a) how often listeners interpret the emotional content of an utterance incorrectly when listening to vocoded or natural speech in adverse conditions; b) which noise conditions cause the most misperceptions; and c) which group of listeners misinterpret emotions the most. The long-term goal is to construct new emotional speech synthesizers that adapt to the environment and to the listener. We performed a large-scale listening test where over 400 listeners between the ages of 21 and 72 assessed natural and vocoded acted emotional speech stimuli. The stimuli had been artificially degraded using a room impulse response recorded in a car and various in-car noise types recorded in a real car. Experimental results show that the recognition rates for emotions and perceived emotional strength degrade as signal-to-noise ratio decreases. Interestingly, misperceptions seem to be more pronounced for negative and low-arousal emotions such as calmness or anger, while positive emotions such as happiness appear to be more robust to noise. An ANOVA analysis of listener meta-data further revealed that gender and age also influenced results, with elderly male listeners most likely to incorrectly identify emotions.}, keywords={emotional perception, speech in noise, emotion recognition, car noise}, address={Stockholm, Sweden}, month={Aug.}, publisher={ISCA}, volume={18}, pages={606--610}, doi={10.21437/Interspeech.2017-532}, year={2017} } @inproceedings{ronanki2016median, title={Median-Based Generation of Synthetic Speech Durations Using a Non-Parametric Approach}, author={Ronanki, Srikanth and Watts, Oliver and King, Simon and Henter, Gustav Eje}, booktitle={Proc. SLT}, abstract={This paper proposes a new approach to duration modelling for statistical parametric speech synthesis in which a recurrent statistical model is trained to output a phone transition probability at each timestep (acoustic frame). Unlike conventional approaches to duration modelling -- which assume that duration distributions have a particular form (e.g., a Gaussian) and use the mean of that distribution for synthesis -- our approach can in principle model any distribution supported on the non-negative integers. Generation from this model can be performed in many ways; here we consider output generation based on the median predicted duration. The median is more typical (more probable) than the conventional mean duration, is robust to training-data irregularities, and enables incremental generation. Furthermore, a frame-level approach to duration prediction is consistent with a longer-term goal of modelling durations and acoustic features together. 
Results indicate that the proposed method is competitive with baseline approaches in approximating the median duration of held-out natural speech.}, keywords={text-to-speech, speech synthesis, duration modelling, non-parametric models, LSTMs}, address={San Diego, CA}, month={Dec.}, publisher={IEEE}, volume={6}, pages={686--692}, doi={10.1109/SLT.2016.7846337}, year={2016} } @inproceedings{luong2017adapting, title={Adapting and Controlling {DNN}-based Speech Synthesis Using Input Codes}, author={Luong, Hieu-Thi and Takaki, Shinji and Henter, Gustav Eje and Yamagishi, Junichi}, booktitle={Proc. ICASSP}, abstract={Methods for adapting and controlling the characteristics of output speech are important topics in speech synthesis. In this work, we investigated the performance of DNN-based text-to-speech systems that in parallel to conventional text input also take speaker, gender, and age codes as inputs, in order to 1) perform multi-speaker synthesis, 2) perform speaker adaptation using small amounts of target-speaker adaptation data, and 3) modify synthetic speech characteristics based on the input codes. Using a large-scale, studio-quality speech corpus with 135 speakers of both genders and ages between tens and eighties, we performed three experiments: 1) First, we used a subset of speakers to construct a DNN-based, multi-speaker acoustic model with speaker codes. 2) Next, we performed speaker adaptation by estimating code vectors for new speakers via backpropagation from a small amount of adaptation material. 3) Finally, we experimented with manually manipulating input code vectors to alter the gender and/or age characteristics of the synthesised speech. Experimental results show that high-performance multi-speaker models can be constructed using the proposed code vectors with a variety of encoding schemes, and that adaptation and manipulation can be performed effectively using the codes.}, keywords={speech synthesis, DNNs, speaker adaptation, speech manipulation, voice morphing}, address={New Orleans, LA}, month={Mar.}, publisher={IEEE}, volume={42}, pages={4905--4909}, doi={10.1109/ICASSP.2017.7953089}, year={2017} } @inproceedings{henter2018cyborg, title={Cyborg speech: Deep multilingual speech synthesis for generating segmental foreign accent with natural prosody}, author={Henter, Gustav Eje and Lorenzo-Trueba, Jaime and Wang, Xin and Kondo, Mariko and Yamagishi, Junichi}, booktitle={Proc. ICASSP}, abstract={We describe a new application of deep-learning-based speech synthesis, namely multilingual speech synthesis for generating controllable foreign accent. Specifically, we train a DBLSTM-based acoustic model on non-accented multilingual speech recordings from a speaker native in several languages. By copying durations and pitch contours from a pre-recorded utterance of the desired prompt, natural prosody is achieved. We call this paradigm "cyborg speech" as it combines human and machine speech parameters. Segmentally accented speech is produced by interpolating specific quinphone linguistic features towards phones from the other language that represent non-native mispronunciations. 
Experiments on synthetic American-English-accented Japanese speech show that subjective synthesis quality matches monolingual synthesis, that natural pitch is maintained, and that naturalistic phone substitutions generate output that is perceived as having an American foreign accent, even though only non-accented training data was used.}, keywords={multilingual speech synthesis, phonetic manipulation, foreign accent, DNN}, address={Calgary, AB}, month={Apr.}, publisher={IEEE}, volume={43}, pages={4799--4803}, doi={10.1109/ICASSP.2018.8462470}, year={2018} } @inproceedings{lorenzo2017investigating, title={Investigating different representations for modeling multiple emotions in {DNN}-based speech synthesis}, author={Lorenzo-Trueba, Jaime and Henter, Gustav Eje and Takaki, Shinji and Yamagishi, Junichi and Morino, Yosuke and Ochiai, Yuta}, booktitle={Proc. International Workshop on Affective Social Multimedia Computing (ASMMC)}, abstract={This paper investigates simultaneous modeling of multiple emotions in DNN-based expressive speech synthesis, and how to represent the emotional labels, such as emotional class and strength, for this task. Our goal is to answer two questions: First, what is the best way to annotate speech data with multiple emotions -- should we use the labels that the speaker intended to express, or labels based on listener perception of the resulting speech signals? Second, how should the emotional information be represented as labels for supervised DNN training, e.g., should emotional class and emotional strength be factorized into separate inputs or not? We evaluate on a large-scale corpus of emotional speech from a professional actress, additionally annotated with perceived emotional labels from crowdsourced listeners. By comparing DNN-based speech synthesizers that utilize different emotional representations, we assess the impact of these representations and design decisions on human emotion recognition rates and perceived emotional strength.}, keywords={emotional speech synthesis, deep neural network, recurrent neural networks}, address={Stockholm, Sweden}, month={Aug.}, volume={3}, year={2017} } @article{lorenzo2018investigating, title={Investigating different representations for modeling and controlling multiple emotions in {DNN}-based speech synthesis}, author={Lorenzo-Trueba, Jaime and Henter, Gustav Eje and Takaki, Shinji and Yamagishi, Junichi and Morino, Yosuke and Ochiai, Yuta}, journal={Speech Commun.}, abstract={In this paper, we investigate the simultaneous modeling of multiple emotions in DNN-based expressive speech synthesis, and how to represent the emotional labels, such as emotional class and strength, for this task. Our goal is to answer two questions: First, what is the best way to annotate speech data with multiple emotions -- should we use the labels that the speaker intended to express, or labels based on listener perception of the resulting speech signals? Second, how should the emotional information be represented as labels for supervised DNN training, e.g., should emotional class and emotional strength be factorized into separate inputs or not? We evaluate on a large-scale corpus of emotional speech from a professional voice actress, additionally annotated with perceived emotional labels from crowdsourced listeners. 
By comparing DNN-based speech synthesizers that utilize different emotional representations, we assess the impact of these representations and design decisions on human emotion recognition rates, perceived emotional strength, and subjective speech quality. Simultaneously, we also study which representations are most appropriate for controlling the emotional strength of synthetic speech.}, keywords={emotional speech synthesis, perception modeling, perceptual evaluation}, month={May}, publisher={Elsevier}, volume={99}, pages={135--143}, doi={10.1016/j.specom.2018.03.002}, year={2018} } @inproceedings{wester2016evaluating, title={Evaluating comprehension of natural and synthetic conversational speech}, author={Wester, Mirjam and Watts, Oliver and Henter, Gustav Eje}, booktitle={Speech Prosody}, abstract={Current speech synthesis methods typically operate on isolated sentences and lack convincing prosody when generating longer segments of speech. Similarly, prevailing TTS evaluation paradigms, such as intelligibility (transcription word error rate) or MOS, only score sentences in isolation, even though overall comprehension is arguably more important for speech-based communication. In an effort to develop more ecologically-relevant evaluation techniques that go beyond isolated sentences, we investigated comprehension of natural and synthetic speech dialogues. Specifically, we tested listener comprehension on long segments of spontaneous and engaging conversational speech (three 10-minute radio interviews of comedians). Interviews were reproduced either as natural speech, synthesised from carefully prepared transcripts, or synthesised using durations from forced-alignment against the natural speech, all in a balanced design. Comprehension was measured using multiple choice questions. A significant difference was measured between the comprehension/retention of natural speech (74\% correct responses) and synthetic speech with forced-aligned durations (61\% correct responses). However, no significant difference was observed between natural and regular synthetic speech (70\% correct responses). Effective evaluation of comprehension remains elusive.}, keywords={evaluation, comprehension, conversational speech, statistical parametric speech synthesis}, address={Boston, MA}, month={June}, publisher={ISCA}, volume={8}, pages={736--740}, doi={10.21437/SpeechProsody.2016-157}, year={2016} } @article{henter2016minimum, title={Minimum Entropy Rate Simplification of Stochastic Processes}, author={Henter, Gustav Eje and Kleijn, W. Bastiaan}, journal={IEEE T. Pattern Anal.}, abstract={We propose minimum entropy rate simplification (MERS), an information-theoretic, parameterization-independent framework for simplifying generative models of stochastic processes. Applications include improving model quality for sampling tasks by concentrating the probability mass on the most characteristic and accurately described behaviors while de-emphasizing the tails, and obtaining clean models from corrupted data (nonparametric denoising). This is the opposite of the smoothing step commonly applied to classification models. Drawing on rate-distortion theory, MERS seeks the minimum entropy-rate process under a constraint on the dissimilarity between the original and simplified processes. 
We particularly investigate the Kullback-Leibler divergence rate as a dissimilarity measure, where, compatible with our assumption that the starting model is disturbed or inaccurate, the simplification rather than the starting model is used for the reference distribution of the divergence. This leads to analytic solutions for stationary and ergodic Gaussian processes and Markov chains. The same formulas are also valid for maximum-entropy smoothing under the same divergence constraint. In experiments, MERS successfully simplifies and denoises models from audio, text, speech, and meteorology.}, keywords={Markov processes, stochastic processes, information theory, signal analysis synthesis and processing, language generation, statistical models}, month={Dec.}, publisher={IEEE}, volume={38}, number={12}, pages={2487--2500}, doi={10.1109/TPAMI.2016.2533382}, year={2016} } @phdthesis{henter2013probabilistic, title={Probabilistic Sequence Models with Speech and Language Applications}, author={Henter, Gustav Eje}, abstract={Series data, sequences of measured values, are ubiquitous. Whenever observations are made along a path in space or time, a data sequence results. To comprehend nature and shape it to our will, or to make informed decisions based on what we know, we need methods to make sense of such data. Of particular interest are probabilistic descriptions, which enable us to represent uncertainty and random variation inherent to the world around us. This thesis presents and expands upon some tools for creating probabilistic models of sequences, with an eye towards applications involving speech and language. Modelling speech and language is not only of use for creating listening, reading, talking, and writing machines---for instance allowing human-friendly interfaces to future computational intelligences and smart devices of today---but probabilistic models may also ultimately tell us something about ourselves and the world we occupy. The central theme of the thesis is the creation of new or improved models more appropriate for our intended applications, by weakening limiting and questionable assumptions made by standard modelling techniques. One contribution of this thesis examines causal-state splitting reconstruction (CSSR), an algorithm for learning discrete-valued sequence models whose states are minimal sufficient statistics for prediction. Unlike many traditional techniques, CSSR does not require the number of process states to be specified a priori, but builds a pattern vocabulary from data alone, making it applicable for language acquisition and the identification of stochastic grammars. A paper in the thesis shows that CSSR handles noise and errors expected in natural data poorly, but that the learner can be extended in a simple manner to yield more robust and stable results also in the presence of corruptions. Even when the complexities of language are put aside, challenges remain. The seemingly simple task of accurately describing human speech signals, so that natural synthetic speech can be generated, has proved difficult, as humans are highly attuned to what speech should sound like. Two papers in the thesis therefore study nonparametric techniques suitable for improved acoustic modelling of speech for synthesis applications. Each of the two papers targets a known-incorrect assumption of established methods, based on the hypothesis that nonparametric techniques can better represent and recreate essential characteristics of natural speech. 
In the first paper of the pair, Gaussian process dynamical models (GPDMs), nonlinear, continuous state-space dynamical models based on Gaussian processes, are shown to better replicate voiced speech, without traditional dynamical features or assumptions that cepstral parameters follow linear autoregressive processes. Additional dimensions of the state-space are able to represent other salient signal aspects such as prosodic variation. The second paper, meanwhile, introduces KDE-HMMs, asymptotically-consistent Markov models for continuous-valued data based on kernel density estimation, that additionally have been extended with a fixed-cardinality discrete hidden state. This construction is shown to provide improved probabilistic descriptions of nonlinear time series, compared to reference models from different paradigms. The hidden state can be used to control process output, making KDE-HMMs compelling as a probabilistic alternative to hybrid speech-synthesis approaches. A final paper of the thesis discusses how models can be improved even when one is restricted to a fundamentally imperfect model class. Minimum entropy rate simplification (MERS), an information-theoretic scheme for postprocessing models for generative applications involving both speech and text, is introduced. MERS reduces the entropy rate of a model while remaining as close as possible to the starting model. This is shown to produce simplified models that concentrate on the most common and characteristic behaviours, and provides a continuum of simplifications between the original model and zero-entropy, completely predictable output. As the tails of fitted distributions may be inflated by noise or empirical variability that a model has failed to capture, MERS's ability to concentrate on high-probability output is also demonstrated to be useful for denoising models trained on disturbed data.}, keywords={time series, acoustic modelling, speech synthesis, stochastic processes, causal-state splitting reconstruction, robust causal states, pattern discovery, Markov models, HMMs, nonparametric models, Gaussian processes, Gaussian process dynamical models, nonlinear Kalman filters, information theory, minimum entropy rate simplification, kernel density estimation, time-series bootstrap}, address={Stockholm, Sweden}, month={Dec.}, school={Communication Theory, School of Electrical Engineering, KTH Royal Institute of Technology}, number={TRITA-EE 2013:042}, url={https://www.diva-portal.org/smash/record.jsf?pid=diva2:667681}, year={2013} } @inproceedings{henter2016ukspeech, title={Robust text-to-speech duration modelling using {DNN}s}, author={Henter, Gustav Eje and Ronanki, Srikanth and Watts, Oliver and Wester, Mirjam and Wu, Zhizheng and King, Simon}, booktitle={Proc. UK Speech}, abstract={Accurate modelling and prediction of speech-sound durations is an important component in generating more natural synthetic speech. Deep neural networks (DNNs) offer a powerful modelling paradigm, and large, found corpora of natural and prosodically-rich speech are easy to acquire for training DNN models. Unfortunately, poor quality control (e.g., transcription errors) as well as hard-to-predict phenomena such as reductions and filled pauses are likely to complicate duration modelling from found data. To mitigate issues caused by these idiosyncrasies, we propose to improve modelling and prediction of speech durations using methods from robust statistics. 
These are able to disregard ill-fitting points in the training material -- errors or other outliers -- in order to describe the typical case better. For instance, parameter estimation can be made robust by changing from maximum likelihood estimation (MLE) to a robust fitting criterion based on the density power divergence (a.k.a. the beta-divergence). Alternatively, the standard approximations for output generation with multi-component mixture density networks (MDNs) can be seen as a heuristic for robust output generation. To evaluate the potential benefits of robust techniques, we used 175 minutes of found data from a free audiobook to build several text-to-speech (TTS) systems with either conventional or robust DNN-based duration prediction. The objective results indicate that robust methods described typical speech durations better than the baselines. (Atypical, poorly predicted durations may be due to transcription errors, known to exist also in the test data, that make some forced-aligned durations unreliable.) Similarly, subjective evaluation using a hybrid MUSHRA/preference test with 21 listeners, each scoring 18 sets of same-sentence stimuli, found that listeners significantly preferred synthetic speech generated using robust methods over the baselines.}, address={Sheffield, UK}, month={June}, volume={5}, pages={44}, url={http://ukspeech.dcs.shef.ac.uk/ukspeech16\_abstracts.pdf}, year={2016} } @inproceedings{watts2016hmms, title={From {HMM}s to {DNN}s: where do the improvements come from?}, author={Watts, Oliver and Henter, Gustav Eje and Merritt, Thomas and Wu, Zhizheng and King, Simon}, booktitle={Proc. ICASSP}, abstract={Deep neural networks (DNNs) have recently been the focus of much text-to-speech research as a replacement for decision trees and hidden Markov models (HMMs) in statistical parametric synthesis systems. Performance improvements have been reported; however, the configuration of systems evaluated makes it impossible to judge how much of the improvement is due to the new machine learning methods, and how much is due to other novel aspects of the systems. Specifically, whereas the decision trees in HMM-based systems typically operate at the state-level, and separate trees are used to handle separate acoustic streams, most DNN-based systems are trained to make predictions simultaneously for all streams at the level of the acoustic frame. This paper isolates the influence of three factors (machine learning method; state vs. frame predictions; separate vs. combined stream predictions) by building a continuum of systems along which only a single factor is varied at a time. We find that replacing decision trees with DNNs and moving from state-level to frame-level predictions both significantly improve listeners' naturalness ratings of synthetic speech produced by the systems. No improvement is found to result from switching from separate-stream to combined-stream predictions.}, keywords={speech synthesis, hidden Markov model, decision tree, deep neural network}, address={Shanghai, China}, month={Mar.}, publisher={IEEE}, volume={41}, pages={5505--5509}, doi={10.1109/ICASSP.2016.7472730}, year={2016} } @inproceedings{dall2016testing, title={Testing the consistency assumption: Pronunciation variant forced alignment in read and spontaneous speech synthesis}, author={Dall, Rasmus and Brognaux, Sandrine and Richmond, Korin and Valentini-Botinhao, Cassia and Henter, Gustav Eje and Hirschberg, Julia and Yamagishi, Junichi and King, Simon}, booktitle={Proc. 
ICASSP}, abstract={Forced alignment for speech synthesis traditionally aligns a phoneme sequence predetermined by the front-end text processing system. This sequence is not altered during alignment, i.e., it is forced, despite possibly being faulty. The consistency assumption is the assumption that these mistakes do not degrade models, as long as the mistakes are consistent across training and synthesis. We present evidence that in the alignment of both standard read prompts and spontaneous speech this phoneme sequence is often wrong, and that this is likely to have a negative impact on acoustic models. A lattice-based forced alignment system allowing for pronunciation variation is implemented, resulting in improved phoneme identity accuracy for both types of speech. A perceptual evaluation of HMM-based voices showed that spontaneous models trained on this improved alignment also improved standard synthesis, despite breaking the consistency assumption.}, keywords={speech synthesis, TTS, forced alignment, HMM}, address={Shanghai, China}, month={Mar.}, publisher={IEEE}, volume={41}, pages={5155--5159}, doi={10.1109/ICASSP.2016.7472660}, year={2016} } @article{leijon2016bayesian, title={{B}ayesian Analysis of Phoneme Confusion Matrices}, author={Leijon, Arne and Henter, Gustav Eje and Dahlquist, Martin}, journal={IEEE/ACM T. Audio Speech}, abstract={This paper presents a parametric Bayesian approach to the statistical analysis of phoneme confusion matrices measured for groups of individual listeners in one or more test conditions. Two different bias problems in conventional estimation of mutual information are analyzed and explained theoretically. Evaluations with synthetic datasets indicate that the proposed Bayesian method can give satisfactory estimates of mutual information and response probabilities, even for phoneme confusion tests using a very small number of test items for each phoneme category. The proposed method can reveal overall differences in performance between two test conditions with better power than conventional Wilcoxon significance tests or conventional confidence intervals. The method can also identify sets of confusion-matrix cells that are credibly different between two test conditions, with better power than a similar approximate frequentist method.}, keywords={speech recognition, parameter estimation, mutual information, Bayes methods}, month={Mar.}, publisher={IEEE}, volume={24}, number={3}, pages={469--482}, doi={10.1109/TASLP.2015.2512039}, year={2016} } @article{henter2013picking, title={Picking up the pieces: Causal states in noisy data, and how to recover them}, author={Henter, Gustav Eje and Kleijn, W. Bastiaan}, journal={Pattern Recogn. Lett.}, abstract={Automatic structure discovery is desirable in many Markov model applications where a good topology (states and transitions) is not known a priori. CSSR is an established pattern discovery algorithm for stationary and ergodic stochastic symbol sequences that learns a predictively optimal Markov representation consisting of so-called causal states. By means of a novel algebraic criterion, we prove that the causal states of a simple process disturbed by random errors frequently are too complex to be learned fully, making CSSR diverge. In fact, the causal state representation of many hidden Markov models, representing simple but noise-disturbed data, has infinite cardinality. We also report that these problems can be solved by endowing CSSR with the ability to make approximations. 
The resulting algorithm, robust causal states (RCS), is able to recover the underlying causal structure from data corrupted by random substitutions, as is demonstrated both theoretically and in an experiment. The algorithm has potential applications in areas such as error correction and learning stochastic grammars.}, keywords={computational mechanics, causal states, CSSR, hidden Markov model, HMM, learnability}, month={Apr.}, publisher={Elsevier}, volume={34}, number={5}, pages={587--594}, doi={10.1016/j.patrec.2012.11.013}, year={2013} } @article{petkov2013maximizing, title={Maximizing phoneme recognition accuracy for enhanced speech intelligibility in noise}, author={Petkov, Petko N. and Henter, Gustav Eje and Kleijn, W. Bastiaan}, journal={IEEE T. Audio Speech}, abstract={An effective measure of speech intelligibility is the probability of correct recognition of the transmitted message. We propose a speech pre-enhancement method based on matching the recognized text to the text of the original message. The selected criterion is accurately approximated by the probability of the correct transcription given an estimate of the noisy speech features. In the presence of environment noise, and with a decrease in the signal-to-noise ratio, speech intelligibility declines. We implement a speech pre-enhancement system that optimizes the proposed criterion for the parameters of two distinct speech modification strategies under an energy-preservation constraint. The proposed method requires prior knowledge in the form of a transcription of the transmitted message and acoustic speech models from an automatic speech recognition system. Performance results from an open-set subjective intelligibility test indicate a significant improvement over natural speech and a reference system that optimizes a perceptual-distortion-based objective intelligibility measure. The computational complexity of the approach permits use in on-line applications.}, keywords={environment adaptation, intelligibility enhancement, speech pre-enhancement}, month={Feb.}, publisher={IEEE}, volume={21}, number={5}, pages={1035--1045}, doi={10.1109/TASL.2013.2244089}, year={2013} } @inproceedings{henter2010simplified, title={Simplified Probability Models for Generative Tasks: a Rate-Distortion Approach}, author={Henter, Gustav Eje and Kleijn, W. Bastiaan}, booktitle={Proc. EUSIPCO}, abstract={We consider using sparse simplifications to denoise probabilistic sequence models for generative tasks such as speech synthesis. Our proposal is to find the least random model that remains close to the original one according to a KL-divergence constraint, a technique we call minimum entropy rate simplification (MERS). This produces a representation-independent framework for trading off simplicity and divergence, similar to rate-distortion theory. Importantly, MERS uses the cleaned model rather than the original one for the underlying probabilities in the KL-divergence, effectively reversing the conventional argument order. This promotes rather than penalizes sparsity, suppressing uncommon outcomes likely to be errors. We write down the MERS equations for Markov chains, and present an iterative solution procedure based on the Blahut-Arimoto algorithm and a bigram matrix Markov chain representation. 
We apply the procedure to a music-based Markov grammar, and compare the results to a simplistic thresholding scheme.}, address={Aalborg, Denmark}, month={Aug.}, publisher={EURASIP}, volume={18}, pages={1159--1163}, url={https://www.eurasip.org/Proceedings/Eusipco/Eusipco2010/Contents/papers/1569292887.pdf}, year={2010} } @inproceedings{henter2011intermediate, title={Intermediate-state {HMM}s to capture continuously-changing signal features}, author={Henter, Gustav Eje and Kleijn, W. Bastiaan}, booktitle={Proc. Interspeech}, abstract={Traditional discrete-state HMMs are not well suited for describing steadily evolving, path-following natural processes like motion capture data or speech. HMMs cannot represent incremental progress between behaviors, and sequences sampled from the models have unnatural segment durations, unsmooth transitions, and excessive rapid variation. We propose to address these problems by permitting the state variable to occupy positions between the discrete states, and present a concrete left-right model incorporating this idea. We call these intermediate-state HMMs. The state evolution remains Markovian. We describe training using the generalized EM-algorithm and present associated update formulas. An experiment shows that the intermediate-state model is capable of gradual transitions, with more natural durations and less noise in sampled sequences compared to a conventional HMM.}, keywords={Markov models, HMMs, speech synthesis}, address={Florence, Italy}, month={Aug.}, publisher={ISCA}, volume={12}, pages={1828--1831}, doi={10.21437/Interspeech.2011-37}, year={2011} } @inproceedings{henter2012gaussian, title={Gaussian process dynamical models for nonparametric speech representation and synthesis}, author={Henter, Gustav Eje and Frean, Marcus R. and Kleijn, W. Bastiaan}, booktitle={Proc. ICASSP}, abstract={We propose Gaussian process dynamical models (GPDMs) as a new, nonparametric paradigm in acoustic models of speech. These use multidimensional, continuous state-spaces to overcome familiar issues with discrete-state, HMM-based speech models. The added dimensions allow the state to represent and describe more than just temporal structure as systematic differences in mean, rather than as mere correlations in a residual (which dynamic features or AR-HMMs do). Being based on Gaussian processes, the models avoid restrictive parametric or linearity assumptions on signal structure. We outline GPDM theory, and describe model setup and initialization schemes relevant to speech applications. Experiments demonstrate subjectively better quality of synthesized speech than from comparable HMMs. In addition, there is evidence for unsupervised discovery of salient speech structure.}, keywords={acoustic models, stochastic models, non-parametric speech synthesis, sampling}, address={Kyoto, Japan}, month={Mar.}, publisher={IEEE}, volume={37}, pages={4505--4508}, doi={10.1109/ICASSP.2012.6288919}, year={2012} } @inproceedings{petkov2012speech, title={Speech intelligibility enhancement using a statistical model of clean speech}, author={Petkov, Petko N. and Kleijn, W. Bastiaan and Henter, Gustav Eje}, booktitle={Proc. Listening Talker Workshop}, address={Edinburgh, UK}, month={May}, pages={77}, year={2012} } @inproceedings{aylett2014flexible, title={A Flexible Front-End for {HTS}}, author={Aylett, Matthew P. and Dall, Rasmus and Ghoshal, Arnab and Henter, Gustav Eje and Merritt, Thomas}, booktitle={Proc.
Interspeech}, abstract={Parametric speech synthesis techniques depend on full context acoustic models generated by language front-ends, which analyse linguistic and phonetic structure. HTS, the leading parametric synthesis system, can use a number of different front-ends to generate full context models for synthesis and training. In this paper we explore the use of a new text processing front-end that has been added to the speech recognition toolkit Kaldi as part of an ongoing project to produce a new parametric speech synthesis system, Idlak. The use of XML specification files, a modular design, and modern coding and testing approaches make the Idlak front-end ideal for adding, altering and experimenting with the contexts used in full context acoustic models. The Idlak front-end was evaluated against the standard Festival front-end in the HTS system. Results from the Idlak front-end compare well with the more mature Festival front-end (Idlak - 2.83 MOS vs Festival - 2.85 MOS), although a slight reduction in naturalness perceived by non-native English speakers can be attributed to Festival's insertion of non-punctuated pauses.}, keywords={speech synthesis, text processing, parametric synthesis, Kaldi, Idlak}, address={Singapore}, month={Sept.}, publisher={ISCA}, volume={15}, pages={1283--1287}, doi={10.21437/Interspeech.2014-320}, year={2014} } @inproceedings{henter2014measuring, title={Measuring the perceptual effects of modelling assumptions in speech synthesis using stimuli constructed from repeated natural speech}, author={Henter, Gustav Eje and Merritt, Thomas and Shannon, Matt and Mayo, Catherine and King, Simon}, booktitle={Proc. Interspeech}, abstract={Acoustic models used for statistical parametric speech synthesis typically incorporate many modelling assumptions. It is an open question to what extent these assumptions limit the naturalness of synthesised speech. To investigate this question, we recorded a speech corpus where each prompt was read aloud multiple times. By combining speech parameter trajectories extracted from different repetitions, we were able to quantify the perceptual effects of certain commonly used modelling assumptions. Subjective listening tests show that taking the source and filter parameters to be conditionally independent, or using diagonal covariance matrices, significantly limits the naturalness that can be achieved. Our experimental results also demonstrate the shortcomings of mean-based parameter generation.}, keywords={speech synthesis, acoustic modelling, stream independence, diagonal covariance matrices, repeated speech}, address={Singapore}, month={Sept.}, publisher={ISCA}, volume={15}, pages={1504--1508}, doi={10.21437/Interspeech.2014-361}, year={2014} } @inproceedings{wester2015using, title={Are we using enough listeners? {N}o! {A}n empirically-supported critique of {I}nterspeech 2014 {TTS} evaluations}, author={Wester, Mirjam and Valentini-Botinhao, Cassia and Henter, Gustav Eje}, booktitle={Proc. Interspeech}, abstract={Tallying the numbers of listeners that took part in subjective evaluations of synthetic speech at Interspeech 2014 showed that in more than 60\% of papers conclusions are based on listening tests with fewer than 20 listeners. Our analysis of Blizzard 2013 data shows that for a MOS test measuring naturalness a stable level of significance is only reached when more than 30 listeners are used. In this paper, we set out a list of guidelines, i.e., a checklist for carrying out meaningful subjective evaluations.
We further illustrate the importance of sentence coverage and the number of listeners by presenting the changes to rank order and to the number of significant pairs that arise when re-analysing data from the Blizzard Challenge 2013.}, keywords={subjective evaluation, text-to-speech, MOS test}, address={Dresden, Germany}, month={Sept.}, publisher={ISCA}, volume={16}, pages={3476--3480}, doi={10.21437/Interspeech.2015-689}, year={2015} }