# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
| """ | |
| This file contains functions for make ASS-format subtitle files based on the generated alignment. | |
| ASS files can be generated highlighting token-level alignments or word-level alignments. | |
| In both cases, 'segment' boundaries will be used to determine which parts of the text will appear | |
| at the same time. | |
| For the token-level ASS files, the text will be highlighted token-by-token, with the timings determined | |
| by the NFA alignments. | |
| For the word-level ASS files, the text will be highlighted word-by-word, with the timings determined | |
| by the NFA alignemtns. | |
| """ | |

import math
import os

import soundfile as sf

from utils.constants import BLANK_TOKEN, SPACE_TOKEN
from utils.data_prep import Segment, Token, Word

# ASS script 'playback resolution' and text margins (in script pixels);
# these determine how much text fits on screen when resegmenting
PLAYERRESX = 384
PLAYERRESY = 288
MARGINL = 10
MARGINR = 10
MARGINV = 20


def seconds_to_ass_format(seconds_float):
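    """Convert a time in seconds to the HH:MM:SS.cc timestamp format used in ASS 'Dialogue' lines.

    >>> seconds_to_ass_format(3723.456)
    '01:02:03.46'
    """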
    seconds_float = float(seconds_float)
    mm, ss_decimals = divmod(seconds_float, 60)
    hh, mm = divmod(mm, 60)
    # zero-pad hours and minutes to two digits, and seconds to two digits before the decimal point
    hh = str(int(hh)).zfill(2)
    mm = str(int(mm)).zfill(2)
    ss_decimals = f"{ss_decimals:05.2f}"
    ass_format_time = f"{hh}:{mm}:{ss_decimals}"
    return ass_format_time


def rgb_list_to_hex_bgr(rgb_list):
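    """Convert an [R, G, B] list of ints (0-255) into the BBGGRR hex string used in ASS colour overrides.

    >>> rgb_list_to_hex_bgr([255, 0, 128])
    '8000ff'
    """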
    r, g, b = rgb_list
    # pad each channel to two hex digits - single-digit channels (e.g. 0 -> "0")
    # would otherwise produce an invalid colour code
    return f"{b:02x}{g:02x}{r:02x}"


def make_ass_files(
    utt_obj, output_dir_root, ass_file_config,
):

    # don't try to make files if utt_obj.segments_and_tokens is empty, which will happen
    # if the ground truth text is empty or the number of tokens is too large relative to the audio duration
    if not utt_obj.segments_and_tokens:
        return utt_obj

    if ass_file_config.resegment_text_to_fill_space:
        utt_obj = resegment_utt_obj(utt_obj, ass_file_config)

    # get duration of the utterance, so we know the final timestamp of the final set of subtitles,
    # which we will keep showing until the end
    with sf.SoundFile(utt_obj.audio_filepath) as f:
        audio_dur = f.frames / f.samplerate

    utt_obj = make_word_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur)
    utt_obj = make_token_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur)

    return utt_obj
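

# A minimal usage sketch (hypothetical paths; 'utt_obj' and 'ass_file_config' stand for the
# Utterance object and ASS config built by the rest of the NFA pipeline):
#
#   utt_obj = make_ass_files(utt_obj, "/tmp/nfa_output", ass_file_config)
#   # -> writes /tmp/nfa_output/ass/words/<utt_id>.ass and /tmp/nfa_output/ass/tokens/<utt_id>.ass,
#   #    and records both paths in utt_obj.saved_output_files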


def _get_word_n_chars(word):
    """Count the number of visible characters in a Word (ignoring blank tokens)."""
    n_chars = 0
    for token in word.tokens:
        if token.text != BLANK_TOKEN:
            n_chars += len(token.text)
    return n_chars


def _get_segment_n_chars(segment):
    """Count the number of visible characters in a Segment, counting each space token as one character."""
    n_chars = 0
    for word_or_token in segment.words_and_tokens:
        if word_or_token.text == SPACE_TOKEN:
            n_chars += 1
        elif word_or_token.text != BLANK_TOKEN:
            n_chars += len(word_or_token.text)
    return n_chars


def resegment_utt_obj(utt_obj, ass_file_config):

    # get a flat list of all words and tokens
    all_words_and_tokens = []
    for segment_or_token in utt_obj.segments_and_tokens:
        if type(segment_or_token) is Segment:
            all_words_and_tokens.extend(segment_or_token.words_and_tokens)
        else:
            all_words_and_tokens.append(segment_or_token)

    # figure out how many chars will fit into one 'slide' and thus should be the max
    # size of a segment
    approx_chars_per_line = (PLAYERRESX - MARGINL - MARGINR) / (
        ass_file_config.fontsize * 0.6
    )  # assume chars are 0.6 times as wide as they are tall
    approx_lines_per_segment = (PLAYERRESY - MARGINV) / (
        ass_file_config.fontsize * 1.15
    )  # assume line spacing is 1.15
    if approx_lines_per_segment > ass_file_config.max_lines_per_segment:
        approx_lines_per_segment = ass_file_config.max_lines_per_segment

    max_chars_per_segment = int(approx_chars_per_line * approx_lines_per_segment)
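    # For example (illustrative numbers), with fontsize 20:
    #   approx_chars_per_line    = (384 - 10 - 10) / (20 * 0.6)  ~= 30.3 chars
    #   approx_lines_per_segment = (288 - 20) / (20 * 1.15)      ~= 11.7 lines (then clamped to max_lines_per_segment)
    # so max_chars_per_segment would be int(30.3 * min(11.7, max_lines_per_segment))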

    new_segments_and_tokens = []
    all_words_and_tokens_pointer = 0
    for word_or_token in all_words_and_tokens:
        if type(word_or_token) is Token:
            new_segments_and_tokens.append(word_or_token)
            all_words_and_tokens_pointer += 1
        else:
            break

    new_segments_and_tokens.append(Segment())

    while all_words_and_tokens_pointer < len(all_words_and_tokens):
        word_or_token = all_words_and_tokens[all_words_and_tokens_pointer]
        if type(word_or_token) is Word:

            # if this is going to be the first word in the segment, we definitely want
            # to add it to the segment
            if not new_segments_and_tokens[-1].words_and_tokens:
                new_segments_and_tokens[-1].words_and_tokens.append(word_or_token)

            else:
                # if not the first word, check what the new length of the segment will be:
                # if short enough - add this word to this segment;
                # if too long - add it to a new segment
                this_word_n_chars = _get_word_n_chars(word_or_token)
                segment_so_far_n_chars = _get_segment_n_chars(new_segments_and_tokens[-1])
                if this_word_n_chars + segment_so_far_n_chars < max_chars_per_segment:
                    new_segments_and_tokens[-1].words_and_tokens.append(word_or_token)
                else:
                    new_segments_and_tokens.append(Segment())
                    new_segments_and_tokens[-1].words_and_tokens.append(word_or_token)

        else:  # i.e. word_or_token is a Token
            # currently this breaks the convention of tokens at the end/beginning
            # of segments being listed as separate tokens in segment.words_and_tokens
            # TODO: change code so we follow this convention
            new_segments_and_tokens[-1].words_and_tokens.append(word_or_token)

        all_words_and_tokens_pointer += 1

    utt_obj.segments_and_tokens = new_segments_and_tokens

    return utt_obj


def make_word_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur):

    default_style_dict = {
        "Name": "Default",
        "Fontname": "Arial",
        "Fontsize": str(ass_file_config.fontsize),
        "PrimaryColour": "&Hffffff",
        "SecondaryColour": "&Hffffff",
        "OutlineColour": "&H0",
        "BackColour": "&H0",
        "Bold": "0",
        "Italic": "0",
        "Underline": "0",
        "StrikeOut": "0",
        "ScaleX": "100",
        "ScaleY": "100",
        "Spacing": "0",
        "Angle": "0",
        "BorderStyle": "1",
        "Outline": "1",
        "Shadow": "0",
        "Alignment": None,  # will specify below
        "MarginL": str(MARGINL),
        "MarginR": str(MARGINR),
        "MarginV": str(MARGINV),
        "Encoding": "0",
    }

    if ass_file_config.vertical_alignment == "top":
        default_style_dict["Alignment"] = "8"  # text will be 'center-justified' and at the top of the screen
    elif ass_file_config.vertical_alignment == "center":
        default_style_dict["Alignment"] = "5"  # text will be 'center-justified' and in the middle of the screen
    elif ass_file_config.vertical_alignment == "bottom":
        default_style_dict["Alignment"] = "2"  # text will be 'center-justified' and at the bottom of the screen
    else:
        raise ValueError(
            f"got an unexpected value for ass_file_config.vertical_alignment: {ass_file_config.vertical_alignment}"
        )

    output_dir = os.path.join(output_dir_root, "ass", "words")
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f"{utt_obj.utt_id}.ass")

    already_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_already_spoken_rgb) + r"&}"
    being_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_being_spoken_rgb) + r"&}"
    not_yet_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_not_yet_spoken_rgb) + r"&}"
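    # e.g. (illustrative values) text_already_spoken_rgb=[49, 46, 61] gives the override tag
    # "{\c&H3d2e31&}" - note the BGR channel order required by ASS colour codes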

    with open(output_file, 'w') as f:
        default_style_top_line = "Format: " + ", ".join(default_style_dict.keys())
        default_style_bottom_line = "Style: " + ",".join(default_style_dict.values())

        f.write(
            (
                "[Script Info]\n"
                "ScriptType: v4.00+\n"
                f"PlayResX: {PLAYERRESX}\n"
                f"PlayResY: {PLAYERRESY}\n"
                "\n"
                "[V4+ Styles]\n"
                f"{default_style_top_line}\n"
                f"{default_style_bottom_line}\n"
                "\n"
                "[Events]\n"
                "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n"
            )
        )
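        # Each subtitle we write below is one 'Dialogue' event following the Format line above,
        # e.g. (illustrative timings and colours):
        # Dialogue: 0,00:00:01.50,00:00:02.10,Default,,0,0,0,,{\c&H3d2e31&}already spoken {\r}{\c&Hffffff&}word{\r}{\c&H808080&} not yet spoken{\r}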

        # write first set of subtitles for text before speech starts to be spoken
        words_in_first_segment = []
        for segment_or_token in utt_obj.segments_and_tokens:
            if type(segment_or_token) is Segment:
                first_segment = segment_or_token

                for word_or_token in first_segment.words_and_tokens:
                    if type(word_or_token) is Word:
                        words_in_first_segment.append(word_or_token)
                break

        text_before_speech = not_yet_spoken_color_code + " ".join([x.text for x in words_in_first_segment]) + r"{\r}"
        subtitle_text = (
            f"Dialogue: 0,{seconds_to_ass_format(0)},{seconds_to_ass_format(words_in_first_segment[0].t_start)},Default,,0,0,0,,"
            + text_before_speech.rstrip()
        )

        f.write(subtitle_text + '\n')

        for segment_or_token in utt_obj.segments_and_tokens:
            if type(segment_or_token) is Segment:
                segment = segment_or_token

                words_in_segment = []
                for word_or_token in segment.words_and_tokens:
                    if type(word_or_token) is Word:
                        words_in_segment.append(word_or_token)

                for word_i, word in enumerate(words_in_segment):

                    text_before = " ".join([x.text for x in words_in_segment[:word_i]])
                    if text_before != "":
                        text_before += " "
                    text_before = already_spoken_color_code + text_before + r"{\r}"

                    if word_i < len(words_in_segment) - 1:
                        text_after = " " + " ".join([x.text for x in words_in_segment[word_i + 1 :]])
                    else:
                        text_after = ""
                    text_after = not_yet_spoken_color_code + text_after + r"{\r}"

                    aligned_text = being_spoken_color_code + word.text + r"{\r}"
                    aligned_text_off = already_spoken_color_code + word.text + r"{\r}"

                    subtitle_text = (
                        f"Dialogue: 0,{seconds_to_ass_format(word.t_start)},{seconds_to_ass_format(word.t_end)},Default,,0,0,0,,"
                        + text_before
                        + aligned_text
                        + text_after.rstrip()
                    )
                    f.write(subtitle_text + '\n')

                    # add subtitles without word-highlighting for any gaps longer than 1 ms
                    # during which no word is being spoken
                    if word_i < len(words_in_segment) - 1:
                        last_word_end = float(words_in_segment[word_i].t_end)
                        next_word_start = float(words_in_segment[word_i + 1].t_start)
                        if next_word_start - last_word_end > 0.001:
                            subtitle_text = (
                                f"Dialogue: 0,{seconds_to_ass_format(last_word_end)},{seconds_to_ass_format(next_word_start)},Default,,0,0,0,,"
                                + text_before
                                + aligned_text_off
                                + text_after.rstrip()
                            )
                            f.write(subtitle_text + '\n')

        # write final set of subtitles for text after speech has been spoken
        words_in_final_segment = []
        for segment_or_token in utt_obj.segments_and_tokens[::-1]:
            if type(segment_or_token) is Segment:
                final_segment = segment_or_token

                for word_or_token in final_segment.words_and_tokens:
                    if type(word_or_token) is Word:
                        words_in_final_segment.append(word_or_token)
                break

        text_after_speech = already_spoken_color_code + " ".join([x.text for x in words_in_final_segment]) + r"{\r}"
        # note: for now we do some extra padding with math.ceil(audio_dur) + 1 to account for the fact that the
        # video with subtitles can become longer than the original audio during the MP4 creation stage
        subtitle_text = (
            f"Dialogue: 0,{seconds_to_ass_format(words_in_final_segment[-1].t_end)},{seconds_to_ass_format(math.ceil(audio_dur)+1)},Default,,0,0,0,,"
            + text_after_speech.rstrip()
        )

        f.write(subtitle_text + '\n')

    utt_obj.saved_output_files["words_level_ass_filepath"] = output_file

    return utt_obj


def make_token_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur):

    default_style_dict = {
        "Name": "Default",
        "Fontname": "Arial",
        "Fontsize": str(ass_file_config.fontsize),
        "PrimaryColour": "&Hffffff",
        "SecondaryColour": "&Hffffff",
        "OutlineColour": "&H0",
        "BackColour": "&H0",
        "Bold": "0",
        "Italic": "0",
        "Underline": "0",
        "StrikeOut": "0",
        "ScaleX": "100",
        "ScaleY": "100",
        "Spacing": "0",
        "Angle": "0",
        "BorderStyle": "1",
        "Outline": "1",
        "Shadow": "0",
        "Alignment": None,  # will specify below
        "MarginL": str(MARGINL),
        "MarginR": str(MARGINR),
        "MarginV": str(MARGINV),
        "Encoding": "0",
    }

    if ass_file_config.vertical_alignment == "top":
        default_style_dict["Alignment"] = "8"  # text will be 'center-justified' and at the top of the screen
    elif ass_file_config.vertical_alignment == "center":
        default_style_dict["Alignment"] = "5"  # text will be 'center-justified' and in the middle of the screen
    elif ass_file_config.vertical_alignment == "bottom":
        default_style_dict["Alignment"] = "2"  # text will be 'center-justified' and at the bottom of the screen
    else:
        raise ValueError(
            f"got an unexpected value for ass_file_config.vertical_alignment: {ass_file_config.vertical_alignment}"
        )
| output_dir = os.path.join(output_dir_root, "ass", "tokens") | |
| os.makedirs(output_dir, exist_ok=True) | |
| output_file = os.path.join(output_dir, f"{utt_obj.utt_id}.ass") | |
| already_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_already_spoken_rgb) + r"&}" | |
| being_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_being_spoken_rgb) + r"&}" | |
| not_yet_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_not_yet_spoken_rgb) + r"&}" | |
| with open(output_file, 'w') as f: | |
| default_style_top_line = "Format: " + ", ".join(default_style_dict.keys()) | |
| default_style_bottom_line = "Style: " + ",".join(default_style_dict.values()) | |
| f.write( | |
| ( | |
| "[Script Info]\n" | |
| "ScriptType: v4.00+\n" | |
| f"PlayResX: {PLAYERRESX}\n" | |
| f"PlayResY: {PLAYERRESY}\n" | |
| "ScaledBorderAndShadow: yes\n" | |
| "\n" | |
| "[V4+ Styles]\n" | |
| f"{default_style_top_line}\n" | |
| f"{default_style_bottom_line}\n" | |
| "\n" | |
| "[Events]\n" | |
| "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n" | |
| ) | |
| ) | |

        # write first set of subtitles for text before speech starts to be spoken
        tokens_in_first_segment = []
        for segment_or_token in utt_obj.segments_and_tokens:
            if type(segment_or_token) is Segment:
                for word_or_token in segment_or_token.words_and_tokens:
                    if type(word_or_token) is Token:
                        if word_or_token.text != BLANK_TOKEN:
                            tokens_in_first_segment.append(word_or_token)
                    else:
                        for token in word_or_token.tokens:
                            if token.text != BLANK_TOKEN:
                                tokens_in_first_segment.append(token)

                break

        for token in tokens_in_first_segment:
            token.text_cased = token.text_cased.replace(
                "▁", " "
            )  # replace underscores used in subword tokens with spaces
            token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ")  # space token with actual space
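        # e.g. (illustrative subword tokens) ["▁hel", "lo", "▁world"] becomes [" hel", "lo", " world"],
        # so joining the tokens with "" below reconstructs " hello world"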

        text_before_speech = (
            not_yet_spoken_color_code + "".join([x.text_cased for x in tokens_in_first_segment]) + r"{\r}"
        )
        subtitle_text = (
            f"Dialogue: 0,{seconds_to_ass_format(0)},{seconds_to_ass_format(tokens_in_first_segment[0].t_start)},Default,,0,0,0,,"
            + text_before_speech.rstrip()
        )

        f.write(subtitle_text + '\n')

        for segment_or_token in utt_obj.segments_and_tokens:
            if type(segment_or_token) is Segment:
                segment = segment_or_token

                tokens_in_segment = []  # make list of (non-blank) tokens
                for word_or_token in segment.words_and_tokens:
                    if type(word_or_token) is Token:
                        if word_or_token.text != BLANK_TOKEN:
                            tokens_in_segment.append(word_or_token)
                    else:
                        for token in word_or_token.tokens:
                            if token.text != BLANK_TOKEN:
                                tokens_in_segment.append(token)

                for token in tokens_in_segment:
                    token.text_cased = token.text_cased.replace(
                        "▁", " "
                    )  # replace underscores used in subword tokens with spaces
                    token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ")  # space token with actual space

                for token_i, token in enumerate(tokens_in_segment):

                    text_before = "".join([x.text_cased for x in tokens_in_segment[:token_i]])
                    text_before = already_spoken_color_code + text_before + r"{\r}"

                    if token_i < len(tokens_in_segment) - 1:
                        text_after = "".join([x.text_cased for x in tokens_in_segment[token_i + 1 :]])
                    else:
                        text_after = ""
                    text_after = not_yet_spoken_color_code + text_after + r"{\r}"

                    aligned_text = being_spoken_color_code + token.text_cased + r"{\r}"
                    aligned_text_off = already_spoken_color_code + token.text_cased + r"{\r}"

                    subtitle_text = (
                        f"Dialogue: 0,{seconds_to_ass_format(token.t_start)},{seconds_to_ass_format(token.t_end)},Default,,0,0,0,,"
                        + text_before
                        + aligned_text
                        + text_after.rstrip()
                    )
                    f.write(subtitle_text + '\n')

                    # add subtitles without token-highlighting for any gaps longer than 1 ms
                    # during which no token is being spoken
                    if token_i < len(tokens_in_segment) - 1:
                        last_token_end = float(tokens_in_segment[token_i].t_end)
                        next_token_start = float(tokens_in_segment[token_i + 1].t_start)
                        if next_token_start - last_token_end > 0.001:
                            subtitle_text = (
                                f"Dialogue: 0,{seconds_to_ass_format(last_token_end)},{seconds_to_ass_format(next_token_start)},Default,,0,0,0,,"
                                + text_before
                                + aligned_text_off
                                + text_after.rstrip()
                            )
                            f.write(subtitle_text + '\n')

        # Write final set of subtitles for text after speech has been spoken.
        # To do this, we need to collect 'tokens_in_final_segment' so that we know what the final line is.
        tokens_in_final_segment = []
        for segment_or_token in utt_obj.segments_and_tokens[::-1]:
            # collect tokens from the final segment - we 'break' so we only look at the final one
            if type(segment_or_token) is Segment:
                # 'segment_or_token' is known to be a Segment, which has the attribute 'words_and_tokens'
                for word_or_token in segment_or_token.words_and_tokens:
                    if type(word_or_token) is Token:
                        if word_or_token.text != BLANK_TOKEN:
                            tokens_in_final_segment.append(word_or_token)
                    else:
                        # 'word_or_token' is known to be a Word, which has the attribute 'tokens'
                        for token in word_or_token.tokens:
                            if token.text != BLANK_TOKEN:
                                tokens_in_final_segment.append(token)
                break

        for token in tokens_in_final_segment:
            token.text_cased = token.text_cased.replace(
                "▁", " "
            )  # replace underscores used in subword tokens with spaces
            token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ")  # space token with actual space

        text_after_speech = (
            already_spoken_color_code + "".join([x.text_cased for x in tokens_in_final_segment]) + r"{\r}"
        )
        # note: for now we do some extra padding with math.ceil(audio_dur) + 1 to account for the fact that the
        # video with subtitles can become longer than the original audio during the MP4 creation stage
        subtitle_text = (
            f"Dialogue: 0,{seconds_to_ass_format(tokens_in_final_segment[-1].t_end)},{seconds_to_ass_format(math.ceil(audio_dur)+1)},Default,,0,0,0,,"
            + text_after_speech.rstrip()
        )

        f.write(subtitle_text + '\n')

    utt_obj.saved_output_files["tokens_level_ass_filepath"] = output_file

    return utt_obj