diff --git a/.gitignore b/.gitignore index c9b568f..86dcf69 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ *.pyc *.swp +hlpr_dialogue_production/data/* +hlpr_speech_synthesis/src/hlpr_speech_synthesis/data/* diff --git a/.travis.yml b/.travis.yml index 4f2b6e0..0328c29 100644 --- a/.travis.yml +++ b/.travis.yml @@ -121,6 +121,7 @@ install: before_script: # source dependencies: install using wstool. - cd ~/catkin_ws/src + - git clone -b catkin git://github.com/interaction-lab/cordial-public.git - wstool init - if [[ -f $ROSINSTALL_FILE ]] ; then wstool merge $ROSINSTALL_FILE ; fi - wstool up diff --git a/hlpr_dialogue_production/CMakeLists.txt b/hlpr_dialogue_production/CMakeLists.txt new file mode 100644 index 0000000..846b9c3 --- /dev/null +++ b/hlpr_dialogue_production/CMakeLists.txt @@ -0,0 +1,194 @@ +cmake_minimum_required(VERSION 2.8.3) +project(hlpr_dialogue_production) + +## Find catkin macros and libraries +## if COMPONENTS list like find_package(catkin REQUIRED COMPONENTS xyz) +## is used, also find other catkin packages +find_package(catkin REQUIRED COMPONENTS + rospy + smach + smach_ros + control_msgs + actionlib + actionlib_msgs +) + +## System dependencies are found with CMake's conventions +# find_package(Boost REQUIRED COMPONENTS system) + + +## Uncomment this if the package has a setup.py. This macro ensures +## modules and global scripts declared therein get installed +## See http://ros.org/doc/api/catkin/html/user_guide/setup_dot_py.html +catkin_python_setup() + +################################################ +## Declare ROS messages, services and actions ## +################################################ + +## To declare and build messages, services or actions from within this +## package, follow these steps: +## * Let MSG_DEP_SET be the set of packages whose message types you use in +## your messages/services/actions (e.g. std_msgs, actionlib_msgs, ...). +## * In the file package.xml: +## * add a build_depend tag for "message_generation" +## * add a build_depend and a run_depend tag for each package in MSG_DEP_SET +## * If MSG_DEP_SET isn't empty the following dependency has been pulled in +## but can be declared for certainty nonetheless: +## * add a run_depend tag for "message_runtime" +## * In this file (CMakeLists.txt): +## * add "message_generation" and every package in MSG_DEP_SET to +## find_package(catkin REQUIRED COMPONENTS ...) +## * add "message_runtime" and every package in MSG_DEP_SET to +## catkin_package(CATKIN_DEPENDS ...) +## * uncomment the add_*_files sections below as needed +## and list every .msg/.srv/.action file to be processed +## * uncomment the generate_messages entry below +## * add every package in MSG_DEP_SET to generate_messages(DEPENDENCIES ...) 
+ +## Generate messages in the 'msg' folder +# add_message_files( +# FILES +# Message1.msg +# Message2.msg +# ) + +# add_service_files( +# FILES +# LookAt.srv +# LookAtT.srv +# LookAtTS.srv +# ) + +## Generate actions in the 'action' folder + add_action_files( + FILES + DialogueAct.action + Gesture.action + Viseme.action + ) + +# Generate added messages and services with any dependencies listed here + generate_messages( + DEPENDENCIES + geometry_msgs + actionlib_msgs + std_msgs + ) + +################################################ +## Declare ROS dynamic reconfigure parameters ## +################################################ + +## To declare and build dynamic reconfigure parameters within this +## package, follow these steps: +## * In the file package.xml: +## * add a build_depend and a run_depend tag for "dynamic_reconfigure" +## * In this file (CMakeLists.txt): +## * add "dynamic_reconfigure" to +## find_package(catkin REQUIRED COMPONENTS ...) +## * uncomment the "generate_dynamic_reconfigure_options" section below +## and list every .cfg file to be processed + +## Generate dynamic reconfigure parameters in the 'cfg' folder +# generate_dynamic_reconfigure_options( +# cfg/DynReconf1.cfg +# cfg/DynReconf2.cfg +# ) + +################################### +## catkin specific configuration ## +################################### +## The catkin_package macro generates cmake config files for your package +## Declare things to be passed to dependent projects +## INCLUDE_DIRS: uncomment this if you package contains header files +## LIBRARIES: libraries you create in this project that dependent projects also need +## CATKIN_DEPENDS: catkin_packages dependent projects also need +## DEPENDS: system dependencies of this project that dependent projects also need +catkin_package( +# INCLUDE_DIRS include +# LIBRARIES hlpr_lookat +# CATKIN_DEPENDS rospy +# DEPENDS system_lib +) + +########### +## Build ## +########### + +## Specify additional locations of header files +## Your package locations should be listed before other locations +# include_directories(include) +include_directories( + ${catkin_INCLUDE_DIRS} +) + +## Declare a C++ library +# add_library(hlpr_lookat +# src/${PROJECT_NAME}/hlpr_lookat.cpp +# ) + +## Add cmake target dependencies of the library +## as an example, code may need to be generated before libraries +## either from message generation or dynamic reconfigure +# add_dependencies(hlpr_lookat ${${PROJECT_NAME}_EXPORTED_TARGETS} ${catkin_EXPORTED_TARGETS}) + +## Declare a C++ executable +# add_executable(hlpr_lookat_node src/hlpr_lookat_node.cpp) + +## Add cmake target dependencies of the executable +## same as for the library above +# add_dependencies(hlpr_lookat_node ${${PROJECT_NAME}_EXPORTED_TARGETS} ${catkin_EXPORTED_TARGETS}) + +## Specify libraries to link a library or executable target against +# target_link_libraries(hlpr_lookat_node +# ${catkin_LIBRARIES} +# ) + +############# +## Install ## +############# + +# all install targets should use catkin DESTINATION variables +# See http://ros.org/doc/api/catkin/html/adv_user_guide/variables.html + +## Mark executable scripts (Python etc.) 
for installation
+## in contrast to setup.py, you can choose the destination
+# install(PROGRAMS
+#   scripts/my_python_script
+#   DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
+# )
+
+## Mark executables and/or libraries for installation
+# install(TARGETS hlpr_lookat hlpr_lookat_node
+#   ARCHIVE DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION}
+#   LIBRARY DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION}
+#   RUNTIME DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
+# )
+
+## Mark cpp header files for installation
+# install(DIRECTORY include/${PROJECT_NAME}/
+#   DESTINATION ${CATKIN_PACKAGE_INCLUDE_DESTINATION}
+#   FILES_MATCHING PATTERN "*.h"
+#   PATTERN ".svn" EXCLUDE
+# )
+
+## Mark other files for installation (e.g. launch and bag files, etc.)
+# install(FILES
+#   # myfile1
+#   # myfile2
+#   DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}
+# )
+
+#############
+## Testing ##
+#############
+
+## Add gtest based cpp test target and link libraries
+# catkin_add_gtest(${PROJECT_NAME}-test test/test_hlpr_lookat.cpp)
+# if(TARGET ${PROJECT_NAME}-test)
+#   target_link_libraries(${PROJECT_NAME}-test ${PROJECT_NAME})
+# endif()
+
+## Add folders to be run by python nosetests
+# catkin_add_nosetests(test)
diff --git a/hlpr_dialogue_production/LICENSE b/hlpr_dialogue_production/LICENSE
new file mode 100644
index 0000000..2511acd
--- /dev/null
+++ b/hlpr_dialogue_production/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2017, Elaine Short
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the SIM Lab nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/hlpr_dialogue_production/README.md b/hlpr_dialogue_production/README.md
new file mode 100644
index 0000000..151b350
--- /dev/null
+++ b/hlpr_dialogue_production/README.md
@@ -0,0 +1,4 @@
+# HLPR Dialogue Production
+
+This package produces robot speech with synchronized behaviors (gestures, visemes, and gaze) for the HLP-R robot.
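+
+## Quick usage sketch
+
+Speech is written as plain text with inline behavior tags (for example
+`<wave>` or `<lookat frame_name x y z>`); each behavior fires on the word
+that follows its tag. A dialogue act is then requested through the
+`DialogueAct` action defined in this package. The snippet below is a minimal
+sketch only: `/HLPR_Dialogue` is a placeholder for whatever action server
+name your launch configuration actually uses, and the frame name and offsets
+are illustrative.
+
+```python
+import rospy
+import actionlib
+import hlpr_dialogue_production.msg as dialogue_msgs
+
+rospy.init_node("dialogue_client_example")
+client = actionlib.SimpleActionClient("/HLPR_Dialogue",  # placeholder name
+                                      dialogue_msgs.DialogueActAction)
+client.wait_for_server()
+
+# Behaviors are tagged inline and fire on the word that follows the tag.
+marked_text = "<wave> Hello there! <lookat pan_base_link 0.5 0.0 0.2> How are you?"
+goal = dialogue_msgs.DialogueActGoal(text_or_key=marked_text)
+client.send_goal(goal)
+client.wait_for_result()
+print(client.get_result().outcome)
+```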
+ diff --git a/hlpr_dialogue_production/action/DialogueAct.action b/hlpr_dialogue_production/action/DialogueAct.action new file mode 100644 index 0000000..e4dabd5 --- /dev/null +++ b/hlpr_dialogue_production/action/DialogueAct.action @@ -0,0 +1,7 @@ +string text_or_key +string audio_file +string behavior_yaml +--- +string outcome +--- +string[] active_states \ No newline at end of file diff --git a/hlpr_dialogue_production/action/Gesture.action b/hlpr_dialogue_production/action/Gesture.action new file mode 100644 index 0000000..531da63 --- /dev/null +++ b/hlpr_dialogue_production/action/Gesture.action @@ -0,0 +1,5 @@ +string[] poses +--- +bool success +--- +string current_pose \ No newline at end of file diff --git a/hlpr_dialogue_production/action/Viseme.action b/hlpr_dialogue_production/action/Viseme.action new file mode 100644 index 0000000..aaa72b6 --- /dev/null +++ b/hlpr_dialogue_production/action/Viseme.action @@ -0,0 +1,5 @@ +string viseme +--- +bool success +--- +string current_pose \ No newline at end of file diff --git a/hlpr_dialogue_production/docs/hlpr_dialogue_production.rst b/hlpr_dialogue_production/docs/hlpr_dialogue_production.rst new file mode 100644 index 0000000..9f8d5f3 --- /dev/null +++ b/hlpr_dialogue_production/docs/hlpr_dialogue_production.rst @@ -0,0 +1,32 @@ +HLPR Dialogue Production +========================= + +Controllers +----------- +.. automodule:: controllers + :members: + +Dialogue +--------- +.. automodule:: dialogue + +.. autoclass:: Synchronizer + :members: __init__, start, reset + +.. autoclass:: ControllerState + :members: __init__, setup_sync + +.. autoclass:: TTSSpeechStart + :members: __init__ + +.. autoclass:: TTSFallbackSpeechStart + :members: __init__ + +.. autoclass:: FileSpeechStart + :members: __init__ + +.. autoclass:: NoPrepSpeechStart + :members: __init__ + +.. autoclass:: SpeechState + :members: __init__ diff --git a/hlpr_dialogue_production/launch/all_controllers.launch b/hlpr_dialogue_production/launch/all_controllers.launch new file mode 100644 index 0000000..5f0c8d7 --- /dev/null +++ b/hlpr_dialogue_production/launch/all_controllers.launch @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + diff --git a/hlpr_dialogue_production/package.xml b/hlpr_dialogue_production/package.xml new file mode 100644 index 0000000..00acf57 --- /dev/null +++ b/hlpr_dialogue_production/package.xml @@ -0,0 +1,62 @@ + + + hlpr_dialogue_production + 0.0.0 + Speech production with simultaneous behavior for HLP-R robot. + + + + + Elaine Short + + + + + + BSD + + + + + + + + + + + + + + + + + + + + + + + + + + catkin + cordial_tts + hlpr_manipulation_utils + message_generation + control_msgs + message_runtime + rospy + actionlib + actionlib_msgs + smach + smach_ros + hlpr_manipulation_utils + cordial_tts + + + + + + + + diff --git a/hlpr_dialogue_production/setup.py b/hlpr_dialogue_production/setup.py new file mode 100644 index 0000000..bac6e30 --- /dev/null +++ b/hlpr_dialogue_production/setup.py @@ -0,0 +1,12 @@ +## ! 
DO NOT MANUALLY INVOKE THIS setup.py, USE CATKIN INSTEAD + +from distutils.core import setup +from catkin_pkg.python_setup import generate_distutils_setup + +# fetch values from package.xml +setup_args = generate_distutils_setup( + packages=['hlpr_dialogue_production'], + package_dir={'': 'src'}, +) + +setup(**setup_args) diff --git a/hlpr_dialogue_production/speech/gen_phrases.sh b/hlpr_dialogue_production/speech/gen_phrases.sh new file mode 100755 index 0000000..49ba659 --- /dev/null +++ b/hlpr_dialogue_production/speech/gen_phrases.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +mkdir audio_data +rosrun cordial_tts gen_phrases.py -o phrases.yaml -d ./audio_data -v Kendra script.txt +oggdec audio_data/*.ogg diff --git a/hlpr_dialogue_production/speech/script.txt b/hlpr_dialogue_production/speech/script.txt new file mode 100644 index 0000000..a0cef2c --- /dev/null +++ b/hlpr_dialogue_production/speech/script.txt @@ -0,0 +1,2 @@ +[test1] Hello there! How are you? +[test2] This phrase tests the timing of the actions. diff --git a/hlpr_dialogue_production/src/hlpr_dialogue_production/AWS_text2speech.py b/hlpr_dialogue_production/src/hlpr_dialogue_production/AWS_text2speech.py new file mode 100644 index 0000000..6048dc0 --- /dev/null +++ b/hlpr_dialogue_production/src/hlpr_dialogue_production/AWS_text2speech.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python +import boto3 +import pygame +from pygame import mixer +import sys +import re +import json +import tempfile +import os +from contextlib import closing +import io +from botocore.exceptions import BotoCoreError, ClientError + +class TextToSpeech(): + def __init__(self, voice='Kimberly'): + self.tag_follow_indices = {} + self.tags = [] + self.client = boto3.client('polly') + self.voice = voice + self.actions = [] + self.polly_tags = [ + 'break', # adding a pause + 'emphasis', # emphasizing words + 'lang', # specify another language for specific words + 'mark', # placing a custom tag in your text + 'p', # adding a pause between paragraphs + 'phoneme', # using phonetic pronunciation + 'prosody', # controlling volume, speaking rate and pitch + 'prosody amazon:max-duration', # setting a maximum duration for synthesized speech + 's', # adding a pause between sentences + 'say-as', # controlling how special types of words are spoken + 'speak', # identifying SSML-Enhanced text + 'sub', # pronouncing acronyms and abbreviations + 'w', # improving pronunciation by specifying parts of speech + 'amazon:auto-breaths', # adding the sound of breathing + 'amazon:effect', + #'amazon:effect name=\"drc\"', #adding dynamic range compression + #'amazon:effect phonation=\"soft\"', # speaking softly + #'amazon:effect vocal-tract-length', # controlling timbre + #'amazon:effect name=\"whispered\"' # whispering + 'amazon:breath' # breathing + ] + + + def remove_tags(self, tagged_text): + untagged_text = '' + open_brackets_idx = [m.start() for m in re.finditer('<', tagged_text)] + close_brackets_idx = [m.start() for m in re.finditer('>', tagged_text)] + assert(len(open_brackets_idx)==len(close_brackets_idx)) + + open_brackets_idx.sort() + close_brackets_idx.sort() + + pointer = 0 + for i in range(len(open_brackets_idx)): + retain = '' + tag = tagged_text[open_brackets_idx[i]:close_brackets_idx[i]+1] + + # keep the tags from amazon polly as part of untagged_text string + if any(pt for pt in self.polly_tags if ('<'+pt+' ' in tag) or ('<'+pt+'>' in tag) or ('{}".format(untagged_text) + + def extract_behaviors(self, tagged_text): + # Remove tags from the input string + untagged_text = 
self.remove_tags(tagged_text) + + # Store which word the tags come after + for tag in list(set(self.tags)): + tag_idxs = [m.start() for m in re.finditer(tag, tagged_text)] # find all occurances + for tag_idx in tag_idxs: + if tag_idx!=0: + substring = tagged_text[:tag_idx] + if(substring[-1]==' '): + substring = substring[:-1] + untagged_substring = self.remove_tags(substring) + prev_word_idx = substring.count(' ') + 1 + + if tag not in self.tag_follow_indices: + self.tag_follow_indices[tag] = [prev_word_idx] + else: + self.tag_follow_indices[tag].append(prev_word_idx) + else: + if tag not in self.tag_follow_indices: + self.tag_follow_indices[tag] = [0] + else: + self.tag_follow_indices[tag].append(0) + + print(self.tag_follow_indices) + for t in self.tag_follow_indices: + for idx in self.tag_follow_indices[t]: + args = t.strip("<>").split() + act = args.pop(0) + self.actions.append([idx, act, args]) + + + # Fetch meta info about speech from AWS using boto3 + try: + response = self.client.synthesize_speech( + OutputFormat='json', + SpeechMarkTypes=['viseme','word'], + Text=untagged_text, + TextType="ssml", + VoiceId=self.voice) + except (BotoCoreError, ClientError) as error: + print(error) + response = "" + + # Unpack meta info json to an unsorted list of dictionaries + output_data = [] + if "AudioStream" in response: + with closing(response["AudioStream"]) as stream: + data = stream.read() + + s = data.split('\n') + s = [json.loads(line) for line in s if line != ''] + else: + print("Could not stream audio") + return untagged_text, [] + + word_times = filter(lambda l: l["type"]=="word", s) # Start edits + for a in self.actions: + if a[0] > len(word_times)-1: + a[0] = s[-1]["time"] / 1000. # convert ms to seconds + else: + a[0] = (word_times[a[0]]["time"]) / 1000. 
# convert ms to seconds + + data=[] + for a in self.actions: + args = a[2] + data.append({"start":float(a[0]), + "type":a[1], + "args":args, + "id": a[1]}) # End edits + + + visemes = map(lambda l: [l["time"],l["value"]], filter(lambda l: l["type"]=="viseme",s)) + for v in visemes: + data.append({"start":float(v[0]) / 1000., # convert ms to seconds + "type":"viseme", + "args":"", + "id": v[1]}) + + return untagged_text, data + + + def phrase_to_file(self, name, tagged_text, output_dir): + + # Remove tags from the input string + untagged_text, s = obj.extract_behaviors(tagged_text) + + # Fetch audio for speech from AWS using boto3 + spoken_text = self.client.synthesize_speech( + OutputFormat='mp3', + Text=untagged_text, + TextType="ssml", + VoiceId=self.voice) + + # get absolute dir path + output_dir_path = os.path.abspath(os.path.expanduser(output_dir)) + + b = { + 'text': '"'+untagged_text+'"', + 'file': output_dir_path + '/' + name + '.ogg', + 'behaviors': s + } + + # Write audio to a file + with open(b['file'],'wb') as f: + f.write(spoken_text['AudioStream'].read()) + f.close() + + return b + + def say(self, untagged_text, wait=False, interrupt=True): + + try: + response = self.client.synthesize_speech( + OutputFormat='ogg_vorbis', + Text=untagged_text, + TextType="ssml", + VoiceId=self.voice) + except: + print "Error synthesizing speech" + return 0 + + with tempfile.TemporaryFile() as f: + if "AudioStream" in response: + with closing(response["AudioStream"]) as stream: + try: + f.write(stream.read()) + except IOError as error: + print(error) + + + else: + print("Could not stream audio") + return + + f.seek(0) + + if not pygame.mixer.get_init(): + pygame.mixer.init() + else: + if interrupt: + pygame.mixer.stop() + channel = pygame.mixer.Channel(5) + channel.set_volume(1.0) + sound = pygame.mixer.Sound(f) + channel.play(sound) + + if wait: + while channel.get_busy(): + pass + return -1 + + + return sound.get_length() + + def is_speaking(self): + # Returns whether or not audio is being played + return not pygame.mixer.get_init() or pygame.mixer.Channel(5).get_busy() + + def shutup(self): + if pygame.mixer.get_init(): + pygame.mixer.stop() + + +if __name__ == '__main__': + tagged_string = "Hello ! How are you? hello again! " + print(tagged_string) + obj = TextToSpeech(voice='Kimberly') + b = obj.phrase_to_file('out',tagged_string, '../../data/') + obj.say(b['text'], wait=True, interrupt=False) diff --git a/hlpr_dialogue_production/src/hlpr_dialogue_production/__init__.py b/hlpr_dialogue_production/src/hlpr_dialogue_production/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hlpr_dialogue_production/src/hlpr_dialogue_production/controllers.py b/hlpr_dialogue_production/src/hlpr_dialogue_production/controllers.py new file mode 100644 index 0000000..2e1501b --- /dev/null +++ b/hlpr_dialogue_production/src/hlpr_dialogue_production/controllers.py @@ -0,0 +1,306 @@ +#!/usr/bin/env python + +# Copyright (c) 2017, Elaine Short, SIM Lab +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# * Neither the name of the SIM Lab nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +"""Defines functions to create behavior controllers for dialogue production. + +This module contains functions that create controllers for each of the +behaviors that one might want to synchronize with robot speech. Currently, +this includes looking at a point in space, executing a gesture with pre- +defined waypoints, and a test controller that just plays a beeping sound (and +can be used to check synchronization). +""" + + +import rospy +import smach +import smach_ros +import actionlib +import random +import os + +from geometry_msgs.msg import TransformStamped, Transform, Vector3, Quaternion +from std_msgs.msg import Header +import hlpr_dialogue_production.msg as dialogue_msgs +import hlpr_record_demonstration.msg as record_msgs +import hlpr_lookat.msg as lookat_msgs +import hlpr_lookat.msg + + +from hlpr_dialogue_production.dialogue import ControllerState +import actionlib_tutorials.msg + + +def lookat_controller_cb(behavior_name, string_args): + """Callback to create a LookatAction goal from string arguments + + Given a string of arguments (pulled from the behavior tags in a speech + string), determines whether the robot should look at the base of the + frame or a point relative to the frame. Prints a warning message to the + screen and returns None if the number of arguments is incorrect or they + are of the wrong type, but does not check that the first argument is a + valid frame (if it is not, the LookatWaypoints action server will return + the error instead). + + Parameters + ---------- + behavior_name : str + The name of the behavior being handled (right now, this will always + be "lookat") + string_args : list of str + The arguments that were parsed from the speech string, as strings + + Returns + ------- + LookatWaypointsGoal + Goal containing the one point to look at. 
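+
+    For example, a tag written in the speech markup as
+    ``<lookat pan_base_link 0.2 0.0 0.5>`` (the frame name and offsets here
+    are purely illustrative) reaches this callback as
+    ``behavior_name="lookat"`` with
+    ``string_args=["pan_base_link", "0.2", "0.0", "0.5"]`` and yields a goal
+    with a single waypoint offset from that frame, while a bare
+    ``<lookat pan_base_link>`` looks at the frame origin.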
+ + """ + if len(string_args)==1: #have just a frame name + vec3=Vector3(0.0,0.0,0.0) + elif len(string_args)==4: + try: + point = map(lambda s: float(s), string_args[1:]) + except ValueError: + rospy.logerr("Invalid arguments to lookat: {}".format(string_args)) + return None + vec3=Vector3(point[0],point[1],point[2]) + else: + rospy.logerr("Invalid arguments to lookat: {}".format(string_args)) + return None + + frame = string_args[0] + + pos = TransformStamped() + pos.child_frame_id = frame + pos.transform = Transform() + pos.transform.translation = vec3 + pos.header = Header() + waypoints = [pos] + waytimes = [rospy.Duration(0.0)] + return hlpr_lookat.msg.LookatWaypointsGoal(waypoints,waytimes) + +def get_lookat_controller(): + """ Sets up the lookat controller state + + Sets up the lookat controller state to connect to the lookat_waypoints + action server in hlpr_lookat. Does not adjust the timing of lookat + behaviors. + + """ + behaviors=["lookat"] + time_adj = None + return ControllerState("LOOKAT_CONTROLLER",behaviors, + "/lookat_waypoints_action_server", + hlpr_lookat.msg.LookatWaypointsAction, + lookat_controller_cb,time_adj) + + + +def keyframe_playback_controller_cb(behavior_name, string_args): + """Callback to create a PlaybackKeyframeDemoGoal from string arguments + + Given the name of a trajectory, looks up the file associated with the + name and packs it into a goal for the hlpr_kinesthetic_teaching playback + server. + + Parameters + ---------- + behavior_name : str + The name of the behavior being handled + string_args : list of str + The arguments that were parsed from the speech string, as strings + (not used) + + Returns + ------- + PlaybackKeyframeDemoGoal + Goal containing the location of the pickle file to play back + + """ + rospy.loginfo("Got keyframe controller callback") + + pickle_locations={"turn_cup":"/home/eshort/robot_movements/rot_90_deg_joint.pkl", + "wave":"/home/eshort/robot_movements/rot_90_deg_joint.pkl"} + filename = "" + + home_path = os.path.expanduser("~") + "/robot_movements/" + string_args[0] + ".pkl" + + if string_args[0] in pickle_locations: + filename = pickle_locations[string_args[0]] + elif os.path.exists(string_args[0]): + filename = string_args[0] + elif os.path.exists(home_path): + filename = home_path + else: + pass + + if filename == "": + rospy.logerr("could not find file for the behavior specified as '" + string_args[0] + "'") + rospy.logerr("I looked in the following locations:") + looked_in = pickle_locations.values() + [string_args[0], home_path] + rospy.logerr(str(looked_in)) + else: + rospy.loginfo("loading behavior from file " + filename) + return record_msgs.PlaybackKeyframeDemoGoal(bag_file_name=filename) + +def get_keyframe_playback_controller(): + """ Sets up the keyframe playback controller state + + Sets up the keyframe playback controller to connect to the + playback_demonstration_action_server action server in hlpr_record_demonstration. + + """ + behaviors=["keyframe"] + time_adj = None + return ControllerState("KEYFRAME_PLAYBACK_CONTROLLER",behaviors, + "/playback_keyframe_demo", + record_msgs.PlaybackKeyframeDemoAction, + keyframe_playback_controller_cb,time_adj) + +def gesture_controller_cb(behavior_name, string_args): + """Callback to create a GestureGoal from string arguments + + Given a string of arguments (pulled from the behavior tags in a speech + string), selects the series of named keypoints that define the geature. 
+ These keypoints are currently defined in the Gesture Action Server (in + ``gesture_action_server.py``. To add more keypoints, see the documentation + for that file. + + Parameters + ---------- + behavior_name : str + The name of the behavior being handled + string_args : list of str + The arguments that were parsed from the speech string, as strings + + Returns + ------- + GestureGoal + Gesture goal containing the list of keypoints for the gesture + + """ + + gesture_poses = {"wave":["wave1","wave2","wave1","wave2"], + "shrug":["right_home","shrug","right_home"], + "thinking":["right_home","hmm","right_home"]} + return dialogue_msgs.GestureGoal(poses=gesture_poses[behavior_name]) + +def get_gesture_controller(): + """ Sets up the gesture controller state + + Sets up the gesture controller state to connect to the gesture action + server in this package. Adjusts the timing of the gesture behaviors + by 1s; you may want to change this for your application. + + """ + behaviors=["wave", "shrug", "thinking"] + time_adj = {"wave":1.0, + "shrug":1.0, + "thinking":1.0} + + return ControllerState("GESTURE_CONTROLLER", behaviors, "/HLPR_Gesture", + dialogue_msgs.GestureAction, + gesture_controller_cb, time_adj) + +def viseme_controller_cb(behavior_name, string_args): + """Callback to create a VisemeGoal from string arguments + + Given a string of arguments (pulled from the behavior tags in a speech + string), selects the series of named keypoints that define the geature. + These keypoints are currently defined in the Gesture Action Server (in + ``gesture_action_server.py``. To add more keypoints, see the documentation + for that file. + + Parameters + ---------- + behavior_name : str + The name of the behavior being handled + string_args : list of str + The arguments that were parsed from the speech string, as strings + + Returns + ------- + VisemeGoal + Viseme goal containing the name of the viseme + + """ + return dialogue_msgs.VisemeGoal(viseme=behavior_name) + +def get_viseme_controller(): + """ Sets up the viseme controller state + + Sets up the viseme controller state to connect to the viseme action + server in this package. + + """ + + + time_adj = {"viseme":0.0} + + behaviors=time_adj.keys() + + return ControllerState("VISEME_CONTROLLER", behaviors, "/HLPR_Visemes", + dialogue_msgs.VisemeAction, + viseme_controller_cb, time_adj) + + +def test_controller_cb(behavior_name, string_args): + """Callback to create a FibonacciGoal from string arguments + + The test action server just uses the Fibonacci goal from the + actionlib_tutorials package. The contents of the goal are not used. + + Parameters + ---------- + behavior_name : str + The name of the behavior being handled (right now, this will always + be "lookat") + string_args : list of str + The arguments that were parsed from the speech string, as strings + + Returns + ------- + FibonacciGoal + Fibonacci goal with order=0; contents not used. + + """ + return actionlib_tutorials.msg.FibonacciGoal(order=0) + +def get_test_controller(): + """ Sets up the test controller state + + Sets up the test controller state to connect to the test action + server in this package. Does not adjust the timing of behaviors. 
+ + """ + behaviors=["test"] + time_adj = {"test":0.0} + return ControllerState("TEST_CONTROLLER",behaviors,"/test_beep_controller", + actionlib_tutorials.msg.FibonacciAction, + test_controller_cb, time_adj) diff --git a/hlpr_dialogue_production/src/hlpr_dialogue_production/dialogue.py b/hlpr_dialogue_production/src/hlpr_dialogue_production/dialogue.py new file mode 100644 index 0000000..cc2867e --- /dev/null +++ b/hlpr_dialogue_production/src/hlpr_dialogue_production/dialogue.py @@ -0,0 +1,901 @@ +#!/usr/bin/env python + +# Copyright (c) 2017, Elaine Short, SIM Lab +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the SIM Lab nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import rospy +import smach +import smach_ros +import wave +import yaml +import threading +import actionlib +import contextlib +from sound_play.libsoundplay import SoundClient +from AWS_text2speech import TextToSpeech + +class Synchronizer(): + """ Object that can be passed to all controllers to synchronize their start + + A single Synchronizer object is passed to all controllers so that the + speech state can signal when the audio has begun to play. This is needed + because online text-to-speech has to download the audio from the internet + before it can begin to play, leading to a variable start time for the + speech audio. + """ + + def __init__(self): + """ Creates a Synchronizer object""" + self.started = False + self.time = None + + def start(self): + """ Sets the state of the Synchronizer to started and saves the time""" + self.started = True + self.time = rospy.Time.now() + + def reset(self): + """ Resets the state of the Synchronizer to not started""" + self.started = False + self.time = None + +class ControllerState(smach.State): + """ Interface to action servers controlling robot behavior + + A ControllerState is a smach.State that sends requests to a specific + action server at the appropriate times. + + .. warning:: After creating the object, you *must* call ``setup_sync`` to provide the state with a Synchronizer. 
The Synchronizer is not included in the constructor so that the object can be created in ``controllers.py`` or elsewhere in code and passed to the SmachWrapper state, which will then set up the Synchronizers and assemble the dialogue act state machine. + + + **Input keys** + * ordered_behaviors : list of dict + An ordered list of all the behaviors the controller should play. A + behavior is a dict with keys "start", "id", and "args", indicating + the start time (after speech begins), name of the behavior, and any + arguments to the behavior, respectively. + + **Outcomes** + * preempted + The behavior was preempted before playing all the behaviors in the list + to completion. + * done + The controller played all of the behaviors in the list. + + """ + def __init__(self, name, behaviors, topic, action_type, arg_list_to_goal_cb, behavior_time_adj=None): + """ Constructor + + Creates the ControllerState with the given name, able to handle the + given behaviors by sending the arguments to the arg_list_to_goal_cb + with a time adjustment factor of behavior_time_adj (assigned on a per- + behavior basis). By convention, the name should be in all caps, and + must not conflict with any other names in the state machine containing + this state. If using the SmachWrapper, the names "SPEECH", "START", + "DIALOGUE" are taken. + + + Parameters + ---------- + name : str + The name of this state in the state machine + + behaviors : list of str + The behaviors (from the speech string tags) that this controller + can handle + + topic : str + The topic for the action server that this controller calls + + action_type : ROS Action Type + The type for the action server that this controller calls + + arg_list_to_goal_cb : function + A function taking the name of the behavior and a list of arguments + and returning a goal for the action of type ``action_type`` + + behavior_time_adj : dict + A mapping from behavior names (from ``behaviors``) to times (in s). + The controller will send the action goal that much earlier. + Negative values will cause the controller to call the action + server later. + + """ + + smach.State.__init__(self,outcomes=["preempted","done"], + input_keys=["ordered_behaviors"]) + self.name = name + self._can_handle = behaviors + self._cb = arg_list_to_goal_cb + self._client = actionlib.SimpleActionClient(topic,action_type) + self._sync = None + if behavior_time_adj!=None: + self._time_adj = behavior_time_adj + else: + self._time_adj = {} + + def setup_sync(self,synchronizer): + """ Provides the controller with a Synchronizer object + + This provides the controller with information about when the speech + audio has begun. This will only work if the controller and speech + state share the same Synchronizer object. + + Parameters + ---------- + synchronizer : Synchronizer + A synchronizer object. All controllers, including the speech state, + should share the same synchronizer. 
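+
+        A minimal wiring sketch (illustrative only; the package's SmachWrapper
+        is expected to do this for you when it assembles the state machine,
+        and SpeechState receives the same object through its constructor)::
+
+            from hlpr_dialogue_production import controllers, dialogue
+
+            sync = dialogue.Synchronizer()
+            gesture = controllers.get_gesture_controller()
+            lookat = controllers.get_lookat_controller()
+            for controller in (gesture, lookat):
+                controller.setup_sync(sync)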
+ + """ + self._sync=synchronizer + + def execute(self,userdata): + if userdata.ordered_behaviors==None: + rospy.logwarn("Controller {} got empty behavior list".format(self.name)) + return "done" + + ordered_behaviors = filter(lambda b: b["type"] in self._can_handle, + userdata.ordered_behaviors) + + while not (self._sync!=None and self._sync.started) and not self.preempt_requested(): + rospy.sleep(0.001) + + if self.preempt_requested(): + self._client.cancel_all_goals() + self.service_preempt() + return "preempted" + + goal_sent=False + for b in ordered_behaviors: + if b["id"] in self._time_adj: + start = b["start"]-self._time_adj[b["id"]] + else: + start = b["start"] + + goal = self._cb(b["id"],b["args"]) + if goal==None: + continue + while not self.preempt_requested() and rospy.Time.now()-self._sync.time < rospy.Duration.from_sec(start): + rospy.sleep(0.001) + + if self.preempt_requested(): + self._client.cancel_all_goals() + self.service_preempt() + return "preempted" + + self._client.send_goal(goal) + goal_sent = True + if goal_sent: + while not self._client.wait_for_result(rospy.Duration(0.05)): + if self.preempt_requested(): + self._client.cancel_all_goals() + self.service_preempt() + return "preempted" + return "done" + +class TTSSpeechStart(smach.State): + """ Speech prep state when using TTS + + State to process marked text to ordered behaviors, unmarked text, and an + empty wav_file, that can be passed on to the Speech state. + + + **Input keys** + * marked_text : str + Text marked up with . Behaviors will be synced to + the word following the tag. + + **Output keys** + * text : str + Text marked up with . Behaviors will be synced to + the word following the tag. + * ordered_behaviors : list of dict + A list of behavior dictionaries. A behavior has the keys "id", + "start", and "args". Ordered by start time. + * wav_file : str + Always None + + **Outcomes** + * done + Finished fetching behaviors + + """ + def __init__(self,voice="Kimberly"): + """ Constructor + + Initializes TTS for speech using Amazon Polly with the given voice + + Parameters + ----------- + voice : str, optional + Which Amazon Polly voice to use. Defaults to Kimberly + """ + + smach.State.__init__(self,outcomes=["done"], + input_keys=["marked_text"], + output_keys=["text","ordered_behaviors","wav_file"]) + + self._tts = TextToSpeech(voice) + + def execute(self,userdata): + text, behaviors = self._tts.extract_behaviors(userdata.marked_text) + wav_file = None + userdata.ordered_behaviors = sorted(behaviors,key = lambda b: b["start"]) + userdata.text = text + userdata.wav_file = wav_file + return "done" + +class TTSFallbackSpeechStart(smach.State): + """ Speech prep from file with online TTS fallback + + State to read text, wave file, and ordered behaviors from a phrase file, + given a key into that file. If the key isn't found, uses online TTS using + Amazon Polly. This is useful for debugging or if there are a small number + of phrases that can't be known in advance. + + **Input keys** + * key_or_marked_text : str + Either a key into the phrase file or text marked up with + . The state tries to read from the phrase file + with this string. If it's not found, assume it is marked up text + and generate the audio with TTS + + **Output keys** + * text : str + Text marked up with . Behaviors will be synced to + the word following the tag. + * ordered_behaviors : list of dict + A list of behavior dictionaries. A behavior has the keys "id", + "start", and "args". Ordered by start time. 
+ * wav_file : str + None if the audio needs to be fetched, otherwise the path to the + audio file. + + **Outcomes** + * done + Finished fetching behaviors + + """ + def __init__(self, phrases, voice="Kimberly"): + """ Constructor + + Initializes TTS for speech using Amazon Polly with the given voice, and + reads in the phrase file. + + Parameters + ----------- + voice : str, optional + Which Amazon Polly voice to use. Defaults to Kimberly + + phrases : str + Path to the phrase file to use. + """ + smach.State.__init__(self,outcomes=["done"], + input_keys=["key_or_marked_text"], + output_keys=["text","ordered_behaviors","wav_file"]) + + self._tts = TextToSpeech(voice) + self._phrases = phrases + + + def execute(self,userdata): + if userdata.key_or_marked_text in self._phrases: + phrase = self._phrases[userdata.key_or_marked_text] + if "text" in phrase: + text = phrase["text"] + else: + text = None + wav_file = phrase["file"] + behaviors = phrase["behaviors"] + else: + text, behaviors = self._tts.extract_behaviors(userdata.key_or_marked_text) + wav_file = None + + userdata.ordered_behaviors = sorted(behaviors,key = lambda b: b["start"]) + userdata.text = text + userdata.wav_file = wav_file + return "done" + +class DebugSpeechStart(smach.State): + """ Debugging speech state with no audio + + Prints the values of the provided userdata, allowing for debugging. Doesn't + actually play any sound. + + **Input keys** + * behaviors : list of dict + A list of behavior dictionaries. A behavior has the keys "id", + "start", and "args". + * wav_file_loc : str + The path to the audio file to play + * key_or_marked_text : str + Either a key into the phrase file or text marked up with + . + + **Output keys** + * text : str + Text marked up with . Behaviors will be synced to + the word following the tag. + * ordered_behaviors : list of dict + A list of behavior dictionaries. A behavior has the keys "id", + "start", and "args". Ordered by start time. + * wav_file : str + None if the audio needs to be fetched, otherwise the path to the + audio file. + + """ + + def __init__(self): + smach.State.__init__(self,outcomes=["done"], + input_keys=["key_or_marked_text", "behaviors", + "wav_file_loc"], + output_keys=["text","ordered_behaviors","wav_file"]) + + def execute(self,userdata): + if userdata.behaviors != None: + userdata.ordered_behaviors = sorted(userdata.behaviors,key = lambda b: b["start"]) + else: + userdata.ordered_behaviors = userdata.behaviors + userdata.text = userdata.key_or_marked_text + userdata.wav_file = userdata.wav_file_loc + + + rospy.logwarn("Speech debug info:") + rospy.logwarn("Ordered Behaviors: {}".format(userdata.behaviors)) + rospy.logwarn("Text: {}".format(userdata.key_or_marked_text)) + rospy.logwarn("Wave file: {}".format(userdata.wav_file_loc)) + + return "done" + + + +class FileSpeechStart(smach.State): + """ Speech prep from file + + State to read text, wave file, and ordered behaviors from a phrase file, + given a key into that file. + + **Input keys** + * key : str + A key into the phrase file + + **Output keys** + * text : str + If the text is included in the phrase file, the text, otherwise + None + * ordered_behaviors : list of dict + A list of behavior dictionaries. A behavior has the keys "id", + "start", and "args". Ordered by start time. + * wav_file : str + The path to the audio file. 
+ + **Outcomes** + * done + Finished fetching behaviors + * not_found + Unable to find the key in the phrase file + + """ + def __init__(self, phrases): + """ Constructor + + Initializes TTS for speech using a phrase file. + + Parameters + ----------- + + phrases : str + Path to the phrase file to use. + """ + smach.State.__init__(self,outcomes=["done","not_found"], + input_keys=["key"], + output_keys=["text","ordered_behaviors","wav_file"]) + self._phrases = phrases + + def execute(self,userdata): + if userdata.key not in self._phrases: + return "not_found" + phrase = self._phrases[userdata.key] + if "text" in phrase: + text = phrase["text"] + else: + text = None + wav_file = phrase["file"] + behaviors = phrase["behaviors"] + + userdata.ordered_behaviors = sorted(behaviors,key = lambda b: b["start"]) + userdata.text = text + userdata.wav_file = wav_file + return "done" + +class NoPrepSpeechStart(smach.State): + """ Pass through speech info without prep + + State for the case where the dialogue state machine will be given text, + behaviors (not necessarily ordered), and a path to a wave file in + the userdata. + + **Input keys** + * behaviors : list of dict + A list of behavior dictionaries. A behavior has the keys "id", + "start", and "args". + * wav_file_loc : str + The path to the audio file to play + + **Output keys** + * text : str + Always None + * ordered_behaviors : list of dict + A list of behavior dictionaries. A behavior has the keys "id", + "start", and "args". Ordered by start time. + * wav_file : str + The path to the audio file. + + **Outcomes** + * done + Finished fetching behaviors + * missing_info + Missing information in the input keys + + """ + def __init__(self): + """ Constructor + + Initializes passthrough state for speech features. + """ + smach.State.__init__(self,outcomes=["done","missing_info"], + input_keys=["behaviors","wav_file_loc"], + output_keys=["text","ordered_behaviors","wav_file"]) + + def execute(self,userdata): + if userdata.behaviors==None: + return "missing_info" + if userdata.wav_file==None: + return "missing_info" + behaviors = userdata.behaviors + text = None + wav_file = userdata.wav_file + + userdata.ordered_behaviors = sorted(behaviors,key = lambda b: b["start"]) + userdata.text = text + userdata.wav_file = wav_file + + return "done" + +class SpeechDebugState(smach.State): + """ Speech debug state + + Doesn't do anything, allowing for debugging of speech input without + requiring tts or audio files. + + **Input keys** + * text : str + Can be None; the text version of the robot's speech. Used to print + to the screen. + * wav_file : str + Can be None if TTS is on. The path to the audio file. + + **Outcomes** + * done + Finished fetching behaviors. Always returns this. + * no_audio + Couldn't find the wave file, or no wave file provided and TTS + turned off. Never returns this. + * preempted + State was preempted before audio finished playing. If preempted, + will try to stop the audio. Never returns this. + + + """ + def __init__(self, synchronizer): + + smach.State.__init__(self,outcomes=["preempted","no_audio","done"], + input_keys=["text","wav_file"]) + self._sync = synchronizer + + def execute(self,userdata): + return "done" + +class SpeechState(smach.State): + """ Speech player state + + Takes in text and/or a wave file. Given a wave file, plays the file. + If no wave file is provided and text-to-speech is on, fetch the audio from + Amazon Polly and play the audio. 
If no wave file is provided and text-to- + speech is off, return "no_audio" + + **Input keys** + * text : str + Can be None; the text version of the robot's speech. Used to print + to the screen. + * wav_file : str + Can be None if TTS is on. The path to the audio file. + + **Outcomes** + * done + Finished fetching behaviors + * no_audio + Couldn't find the wave file, or no wave file provided and TTS + turned off + * preempted + State was preempted before audio finished playing. If preempted, + will try to stop the audio. + + """ + def __init__(self, use_tts, synchronizer, phrases=None, voice="Kimberly"): + """ Constructor + + Initializes the speech state with the desired parameters; either with + or without online TTS, and with or without a pre-generated phrase + file. + + Parameters + ----------- + use_tts : bool + If true, allow online TTS + synchronizer : Synchronizer + Synchronizer object to allow sync of speech and behaviors. Should + be the same object as is passed to behavior ControllerState objects + with ``setup_sync`` + voice : str, optional + Which Amazon Polly voice to use. Defaults to Kimberly + + phrases : str, optional + Path to the phrase file to use. If None, require online TTS. + """ + smach.State.__init__(self,outcomes=["preempted","no_audio","done"], + input_keys=["text","wav_file"]) + self._tts = use_tts + if use_tts: + + self._talker = TextToSpeech(voice) + + self._sound_client = SoundClient() + self._sync = synchronizer + + def execute(self,userdata): + rospy.loginfo("Saying: {}".format(userdata.text)) + if userdata.wav_file==None: + if userdata.text!=None and len(userdata.text)!=0: + speech_duration = self._talker.say(userdata.text,wait=False) + rospy.sleep(0.5) + self._sync.start() + else: + return "no_audio" + else: + wav_file = userdata.wav_file + with contextlib.closing(wave.open(wav_file,'r')) as f: + frames=f.getnframes() + rate=f.getframerate() + speech_duration=frames/float(rate) + self._sound_client.playWave(wav_file) + self._sync.start() + + while rospy.Time.now()-self._sync.time',tagged_text) + + def remove_tags(self, tagged_text): + untagged_text = '' + open_brackets_idx = [m.start() for m in re.finditer('<', tagged_text)] + close_brackets_idx = [m.start() for m in re.finditer('>', tagged_text)] + assert(len(open_brackets_idx)==len(close_brackets_idx)) + + open_brackets_idx.sort() + close_brackets_idx.sort() + + pointer = 0 + for i in range(len(open_brackets_idx)): + if(open_brackets_idx[i]!=0): + untagged_text = untagged_text + tagged_text[pointer:open_brackets_idx[i]-1] + pointer = close_brackets_idx[i]+1 + if(pointer!=len(tagged_text)): + untagged_text = untagged_text + tagged_text[pointer:] + + return untagged_text + + def extract_behaviors(self, tagged_text): + # Remove tags from the input string + untagged_text = self.remove_tags(tagged_text) + untagged_word_list = untagged_text.split() + self.collect_tags(tagged_text) + + # Store which word the tags come after + for tag in list(set(self.tags)): + tag_idxs = [m.start() for m in re.finditer(tag, tagged_text)] # find all occurances + for tag_idx in tag_idxs: + if tag_idx!=0: + substring = tagged_text[:tag_idx] + if(substring[-1]==' '): + substring = substring[:-1] + untagged_substring = self.remove_tags(substring) + prev_word_idx = untagged_substring.count(' ') + 1 + + if tag not in self.tag_follow_indices: + self.tag_follow_indices[tag] = [prev_word_idx] + else: + self.tag_follow_indices[tag].append(prev_word_idx) + else: + if tag not in self.tag_follow_indices: + self.tag_follow_indices[tag] = [0] 
+ else: + self.tag_follow_indices[tag].append(0) + + # Fetch meta info about speech from AWS using boto3 + response = self.client.synthesize_speech( + OutputFormat='json', + SpeechMarkTypes=['viseme','word'], + Text=untagged_text, + VoiceId=self.voice) + + # Unpack meta info json to an unsorted list of dictionaries + s = [] + if "AudioStream" in response: + with closing(response["AudioStream"]) as stream: + data = stream.read() + + s = data.split('\n') + s = [json.loads(line) for line in s if line != ''] + else: + print("Could not stream audio") + + return untagged_text, s + + def phrase_to_file(self, tagged_text, output_file_loc): + + # Remove tags from the input string + untagged_text, s = obj.extract_behaviors(tagged_text) + + # Fetch audio for speech from AWS using boto3 + spoken_text = self.client.synthesize_speech( + OutputFormat='mp3', + Text=untagged_text, + VoiceId=self.voice) + + # Write audio to a file + with open(output_file_loc,'wb') as f: + f.write(spoken_text['AudioStream'].read()) + f.close() + + b = { + 'text': untagged_text, + 'file': os.getcwd() + '/' + output_file_loc, + 'behaviors': s + } + + return b + + def say(self, untagged_text, wait=False, interrupt=True): + + response = self.client.synthesize_speech( + OutputFormat='ogg_vorbis', + Text=untagged_text, + VoiceId=self.voice) + + with tempfile.TemporaryFile() as f: + if "AudioStream" in response: + with closing(response["AudioStream"]) as stream: + try: + f.write(stream.read()) + except IOError as error: + print(error) + sys.exit(-1) + + else: + print("Could not stream audio") + sys.exit(-1) + + f.seek(0) + + if not pygame.mixer.get_init(): + pygame.mixer.init() + else: + if interrupt: + pygame.mixer.stop() + channel = pygame.mixer.Channel(5) + channel.set_volume(1.0) + sound = pygame.mixer.Sound(f) + channel.play(sound) + + if wait: + while channel.get_busy(): + pass + return -1 + + + return sound.get_length() + + def is_speaking(self): + # Returns whether or not audio is being played + return mixer.music.get_busy() + +if __name__ == '__main__': + tagged_string = "Hello ! How are you?" + print(tagged_string) + obj = TextToSpeech(voice='Kimberly') + b = obj.phrase_to_file(tagged_string, 'data/out.mp3') + obj.say(b['text'], wait=True, interrupt=False) \ No newline at end of file
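
A note on pre-generating phrases: as committed, `phrase_to_file` in
`AWS_text2speech.py` calls `obj.extract_behaviors(...)` rather than
`self.extract_behaviors(...)`, so it only runs from that file's `__main__`
block; the sketch below assumes that reference is changed to `self`. It builds
a single phrase entry in the `{key: {text, file, behaviors}}` layout that
`FileSpeechStart` and `TTSFallbackSpeechStart` read. The shipped route for this
is `speech/gen_phrases.sh` (via `cordial_tts`), and the audio may still need
decoding to `.wav` (gen_phrases.sh runs `oggdec`) before `SpeechState` can play
it. AWS credentials for boto3/Polly are required, and the key, voice, and
paths here are illustrative.

```python
#!/usr/bin/env python
# Sketch: write a one-entry phrase file for hlpr_dialogue_production.
# Assumes the package is built (so the module is importable), AWS credentials
# are configured for boto3/Polly, and phrase_to_file uses self.extract_behaviors.
import os
import yaml
from hlpr_dialogue_production.AWS_text2speech import TextToSpeech

out_dir = "./audio_data"  # illustrative, mirroring gen_phrases.sh
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

tts = TextToSpeech(voice="Kimberly")
entry = tts.phrase_to_file("test1", "<wave> Hello there! How are you?", out_dir)

# entry has the keys "text", "file", and "behaviors" expected by the
# FileSpeechStart / TTSFallbackSpeechStart states.
with open("phrases.yaml", "w") as f:
    yaml.dump({"test1": entry}, f, default_flow_style=False)
```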