# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
| """ | |
| This file contains functions for make ASS-format subtitle files based on the generated alignment. | |
| ASS files can be generated highlighting token-level alignments or word-level alignments. | |
| In both cases, 'segment' boundaries will be used to determine which parts of the text will appear | |
| at the same time. | |
| For the token-level ASS files, the text will be highlighted token-by-token, with the timings determined | |
| by the NFA alignments. | |
| For the word-level ASS files, the text will be highlighted word-by-word, with the timings determined | |
| by the NFA alignemtns. | |
| """ | |

import math
import os

import soundfile as sf

from utils.constants import BLANK_TOKEN, SPACE_TOKEN
from utils.data_prep import Segment, Token, Word

# ASS script 'playback resolution' and text margins (in script pixels);
# these determine how much text fits on screen when resegmenting
PLAYERRESX = 384
PLAYERRESY = 288
MARGINL = 10
MARGINR = 10
MARGINV = 20


def seconds_to_ass_format(seconds_float):
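    """Convert a time in seconds to the HH:MM:SS.cc timestamp format used in ASS 'Dialogue' lines.

    >>> seconds_to_ass_format(3723.456)
    '01:02:03.46'
    """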
    seconds_float = float(seconds_float)
    mm, ss_decimals = divmod(seconds_float, 60)
    hh, mm = divmod(mm, 60)
    # zero-pad hours and minutes to two digits, and seconds to two digits before the decimal point
    hh = str(int(hh)).zfill(2)
    mm = str(int(mm)).zfill(2)
    ss_decimals = f"{ss_decimals:05.2f}"
    ass_format_time = f"{hh}:{mm}:{ss_decimals}"
    return ass_format_time


def rgb_list_to_hex_bgr(rgb_list):
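    """Convert an [R, G, B] list of ints (0-255) into the BBGGRR hex string used in ASS colour overrides.

    >>> rgb_list_to_hex_bgr([255, 0, 128])
    '8000ff'
    """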
    r, g, b = rgb_list
    # pad each channel to two hex digits - single-digit channels (e.g. 0 -> "0")
    # would otherwise produce an invalid colour code
    return f"{b:02x}{g:02x}{r:02x}"


def make_ass_files(
    utt_obj, output_dir_root, ass_file_config,
):

    # don't try to make files if utt_obj.segments_and_tokens is empty, which will happen
    # if the ground truth text is empty or the number of tokens is too large relative to the audio duration
    if not utt_obj.segments_and_tokens:
        return utt_obj

    if ass_file_config.resegment_text_to_fill_space:
        utt_obj = resegment_utt_obj(utt_obj, ass_file_config)

    # get duration of the utterance, so we know the final timestamp of the final set of subtitles,
    # which we will keep showing until the end
    with sf.SoundFile(utt_obj.audio_filepath) as f:
        audio_dur = f.frames / f.samplerate

    utt_obj = make_word_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur)
    utt_obj = make_token_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur)

    return utt_obj
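

# A minimal usage sketch (hypothetical paths; 'utt_obj' and 'ass_file_config' stand for the
# Utterance object and ASS config built by the rest of the NFA pipeline):
#
#   utt_obj = make_ass_files(utt_obj, "/tmp/nfa_output", ass_file_config)
#   # -> writes /tmp/nfa_output/ass/words/<utt_id>.ass and /tmp/nfa_output/ass/tokens/<utt_id>.ass,
#   #    and records both paths in utt_obj.saved_output_files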


def _get_word_n_chars(word):
    """Count the number of visible characters in a Word (ignoring blank tokens)."""
    n_chars = 0
    for token in word.tokens:
        if token.text != BLANK_TOKEN:
            n_chars += len(token.text)
    return n_chars


def _get_segment_n_chars(segment):
    """Count the number of visible characters in a Segment, counting each space token as one character."""
    n_chars = 0
    for word_or_token in segment.words_and_tokens:
        if word_or_token.text == SPACE_TOKEN:
            n_chars += 1
        elif word_or_token.text != BLANK_TOKEN:
            n_chars += len(word_or_token.text)
    return n_chars


def resegment_utt_obj(utt_obj, ass_file_config):

    # get a flat list of all words and tokens
    all_words_and_tokens = []
    for segment_or_token in utt_obj.segments_and_tokens:
        if type(segment_or_token) is Segment:
            all_words_and_tokens.extend(segment_or_token.words_and_tokens)
        else:
            all_words_and_tokens.append(segment_or_token)

    # figure out how many chars will fit into one 'slide' and thus should be the max
    # size of a segment
    approx_chars_per_line = (PLAYERRESX - MARGINL - MARGINR) / (
        ass_file_config.fontsize * 0.6
    )  # assume chars are 0.6 times as wide as they are tall
    approx_lines_per_segment = (PLAYERRESY - MARGINV) / (
        ass_file_config.fontsize * 1.15
    )  # assume line spacing is 1.15
    if approx_lines_per_segment > ass_file_config.max_lines_per_segment:
        approx_lines_per_segment = ass_file_config.max_lines_per_segment

    max_chars_per_segment = int(approx_chars_per_line * approx_lines_per_segment)
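    # For example (illustrative numbers), with fontsize 20:
    #   approx_chars_per_line    = (384 - 10 - 10) / (20 * 0.6)  ~= 30.3 chars
    #   approx_lines_per_segment = (288 - 20) / (20 * 1.15)      ~= 11.7 lines (then clamped to max_lines_per_segment)
    # so max_chars_per_segment would be int(30.3 * min(11.7, max_lines_per_segment))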

    new_segments_and_tokens = []
    all_words_and_tokens_pointer = 0
    for word_or_token in all_words_and_tokens:
        if type(word_or_token) is Token:
            new_segments_and_tokens.append(word_or_token)
            all_words_and_tokens_pointer += 1
        else:
            break

    new_segments_and_tokens.append(Segment())

    while all_words_and_tokens_pointer < len(all_words_and_tokens):
        word_or_token = all_words_and_tokens[all_words_and_tokens_pointer]
        if type(word_or_token) is Word:

            # if this is going to be the first word in the segment, we definitely want
            # to add it to the segment
            if not new_segments_and_tokens[-1].words_and_tokens:
                new_segments_and_tokens[-1].words_and_tokens.append(word_or_token)

            else:
                # if not the first word, check what the new length of the segment will be:
                # if short enough - add this word to this segment;
                # if too long - add it to a new segment
                this_word_n_chars = _get_word_n_chars(word_or_token)
                segment_so_far_n_chars = _get_segment_n_chars(new_segments_and_tokens[-1])
                if this_word_n_chars + segment_so_far_n_chars < max_chars_per_segment:
                    new_segments_and_tokens[-1].words_and_tokens.append(word_or_token)
                else:
                    new_segments_and_tokens.append(Segment())
                    new_segments_and_tokens[-1].words_and_tokens.append(word_or_token)

        else:  # i.e. word_or_token is a Token
            # currently this breaks the convention of tokens at the end/beginning
            # of segments being listed as separate tokens in segment.words_and_tokens
            # TODO: change code so we follow this convention
            new_segments_and_tokens[-1].words_and_tokens.append(word_or_token)

        all_words_and_tokens_pointer += 1

    utt_obj.segments_and_tokens = new_segments_and_tokens

    return utt_obj


def make_word_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur):

    default_style_dict = {
        "Name": "Default",
        "Fontname": "Arial",
        "Fontsize": str(ass_file_config.fontsize),
        "PrimaryColour": "&Hffffff",
        "SecondaryColour": "&Hffffff",
        "OutlineColour": "&H0",
        "BackColour": "&H0",
        "Bold": "0",
        "Italic": "0",
        "Underline": "0",
        "StrikeOut": "0",
        "ScaleX": "100",
        "ScaleY": "100",
        "Spacing": "0",
        "Angle": "0",
        "BorderStyle": "1",
        "Outline": "1",
        "Shadow": "0",
        "Alignment": None,  # will specify below
        "MarginL": str(MARGINL),
        "MarginR": str(MARGINR),
        "MarginV": str(MARGINV),
        "Encoding": "0",
    }

    if ass_file_config.vertical_alignment == "top":
        default_style_dict["Alignment"] = "8"  # text will be 'center-justified' and at the top of the screen
    elif ass_file_config.vertical_alignment == "center":
        default_style_dict["Alignment"] = "5"  # text will be 'center-justified' and in the middle of the screen
    elif ass_file_config.vertical_alignment == "bottom":
        default_style_dict["Alignment"] = "2"  # text will be 'center-justified' and at the bottom of the screen
    else:
        raise ValueError(
            f"got an unexpected value for ass_file_config.vertical_alignment: {ass_file_config.vertical_alignment}"
        )

    output_dir = os.path.join(output_dir_root, "ass", "words")
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f"{utt_obj.utt_id}.ass")

    already_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_already_spoken_rgb) + r"&}"
    being_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_being_spoken_rgb) + r"&}"
    not_yet_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_not_yet_spoken_rgb) + r"&}"
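    # e.g. (illustrative values) text_already_spoken_rgb=[49, 46, 61] gives the override tag
    # "{\c&H3d2e31&}" - note the BGR channel order required by ASS colour codes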

    with open(output_file, 'w') as f:
        default_style_top_line = "Format: " + ", ".join(default_style_dict.keys())
        default_style_bottom_line = "Style: " + ",".join(default_style_dict.values())

        f.write(
            (
                "[Script Info]\n"
                "ScriptType: v4.00+\n"
                f"PlayResX: {PLAYERRESX}\n"
                f"PlayResY: {PLAYERRESY}\n"
                "\n"
                "[V4+ Styles]\n"
                f"{default_style_top_line}\n"
                f"{default_style_bottom_line}\n"
                "\n"
                "[Events]\n"
                "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n"
            )
        )
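        # Each subtitle we write below is one 'Dialogue' event following the Format line above,
        # e.g. (illustrative timings and colours):
        # Dialogue: 0,00:00:01.50,00:00:02.10,Default,,0,0,0,,{\c&H3d2e31&}already spoken {\r}{\c&Hffffff&}word{\r}{\c&H808080&} not yet spoken{\r}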

        # write first set of subtitles for text before speech starts to be spoken
        words_in_first_segment = []
        for segment_or_token in utt_obj.segments_and_tokens:
            if type(segment_or_token) is Segment:
                first_segment = segment_or_token

                for word_or_token in first_segment.words_and_tokens:
                    if type(word_or_token) is Word:
                        words_in_first_segment.append(word_or_token)
                break

        text_before_speech = not_yet_spoken_color_code + " ".join([x.text for x in words_in_first_segment]) + r"{\r}"
        subtitle_text = (
            f"Dialogue: 0,{seconds_to_ass_format(0)},{seconds_to_ass_format(words_in_first_segment[0].t_start)},Default,,0,0,0,,"
            + text_before_speech.rstrip()
        )

        f.write(subtitle_text + '\n')

        for segment_or_token in utt_obj.segments_and_tokens:
            if type(segment_or_token) is Segment:
                segment = segment_or_token

                words_in_segment = []
                for word_or_token in segment.words_and_tokens:
                    if type(word_or_token) is Word:
                        words_in_segment.append(word_or_token)

                for word_i, word in enumerate(words_in_segment):

                    text_before = " ".join([x.text for x in words_in_segment[:word_i]])
                    if text_before != "":
                        text_before += " "
                    text_before = already_spoken_color_code + text_before + r"{\r}"

                    if word_i < len(words_in_segment) - 1:
                        text_after = " " + " ".join([x.text for x in words_in_segment[word_i + 1 :]])
                    else:
                        text_after = ""
                    text_after = not_yet_spoken_color_code + text_after + r"{\r}"

                    aligned_text = being_spoken_color_code + word.text + r"{\r}"
                    aligned_text_off = already_spoken_color_code + word.text + r"{\r}"

                    subtitle_text = (
                        f"Dialogue: 0,{seconds_to_ass_format(word.t_start)},{seconds_to_ass_format(word.t_end)},Default,,0,0,0,,"
                        + text_before
                        + aligned_text
                        + text_after.rstrip()
                    )
                    f.write(subtitle_text + '\n')

                    # add subtitles without word-highlighting for any gaps longer than 1 ms
                    # during which no word is being spoken
                    if word_i < len(words_in_segment) - 1:
                        last_word_end = float(words_in_segment[word_i].t_end)
                        next_word_start = float(words_in_segment[word_i + 1].t_start)
                        if next_word_start - last_word_end > 0.001:
                            subtitle_text = (
                                f"Dialogue: 0,{seconds_to_ass_format(last_word_end)},{seconds_to_ass_format(next_word_start)},Default,,0,0,0,,"
                                + text_before
                                + aligned_text_off
                                + text_after.rstrip()
                            )
                            f.write(subtitle_text + '\n')

        # write final set of subtitles for text after speech has been spoken
        words_in_final_segment = []
        for segment_or_token in utt_obj.segments_and_tokens[::-1]:
            if type(segment_or_token) is Segment:
                final_segment = segment_or_token

                for word_or_token in final_segment.words_and_tokens:
                    if type(word_or_token) is Word:
                        words_in_final_segment.append(word_or_token)
                break

        text_after_speech = already_spoken_color_code + " ".join([x.text for x in words_in_final_segment]) + r"{\r}"
        # note: for now we do some extra padding with math.ceil(audio_dur) + 1 to account for the fact that the
        # video with subtitles can become longer than the original audio during the MP4 creation stage
        subtitle_text = (
            f"Dialogue: 0,{seconds_to_ass_format(words_in_final_segment[-1].t_end)},{seconds_to_ass_format(math.ceil(audio_dur)+1)},Default,,0,0,0,,"
            + text_after_speech.rstrip()
        )

        f.write(subtitle_text + '\n')

    utt_obj.saved_output_files["words_level_ass_filepath"] = output_file

    return utt_obj


def make_token_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur):

    default_style_dict = {
        "Name": "Default",
        "Fontname": "Arial",
        "Fontsize": str(ass_file_config.fontsize),
        "PrimaryColour": "&Hffffff",
        "SecondaryColour": "&Hffffff",
        "OutlineColour": "&H0",
        "BackColour": "&H0",
        "Bold": "0",
        "Italic": "0",
        "Underline": "0",
        "StrikeOut": "0",
        "ScaleX": "100",
        "ScaleY": "100",
        "Spacing": "0",
        "Angle": "0",
        "BorderStyle": "1",
        "Outline": "1",
        "Shadow": "0",
        "Alignment": None,  # will specify below
        "MarginL": str(MARGINL),
        "MarginR": str(MARGINR),
        "MarginV": str(MARGINV),
        "Encoding": "0",
    }

    if ass_file_config.vertical_alignment == "top":
        default_style_dict["Alignment"] = "8"  # text will be 'center-justified' and at the top of the screen
    elif ass_file_config.vertical_alignment == "center":
        default_style_dict["Alignment"] = "5"  # text will be 'center-justified' and in the middle of the screen
    elif ass_file_config.vertical_alignment == "bottom":
        default_style_dict["Alignment"] = "2"  # text will be 'center-justified' and at the bottom of the screen
    else:
        raise ValueError(
            f"got an unexpected value for ass_file_config.vertical_alignment: {ass_file_config.vertical_alignment}"
        )
| output_dir = os.path.join(output_dir_root, "ass", "tokens") | |
| os.makedirs(output_dir, exist_ok=True) | |
| output_file = os.path.join(output_dir, f"{utt_obj.utt_id}.ass") | |
| already_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_already_spoken_rgb) + r"&}" | |
| being_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_being_spoken_rgb) + r"&}" | |
| not_yet_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_not_yet_spoken_rgb) + r"&}" | |
| with open(output_file, 'w') as f: | |
| default_style_top_line = "Format: " + ", ".join(default_style_dict.keys()) | |
| default_style_bottom_line = "Style: " + ",".join(default_style_dict.values()) | |
| f.write( | |
| ( | |
| "[Script Info]\n" | |
| "ScriptType: v4.00+\n" | |
| f"PlayResX: {PLAYERRESX}\n" | |
| f"PlayResY: {PLAYERRESY}\n" | |
| "ScaledBorderAndShadow: yes\n" | |
| "\n" | |
| "[V4+ Styles]\n" | |
| f"{default_style_top_line}\n" | |
| f"{default_style_bottom_line}\n" | |
| "\n" | |
| "[Events]\n" | |
| "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n" | |
| ) | |
| ) | |

        # write first set of subtitles for text before speech starts to be spoken
        tokens_in_first_segment = []
        for segment_or_token in utt_obj.segments_and_tokens:
            if type(segment_or_token) is Segment:
                for word_or_token in segment_or_token.words_and_tokens:
                    if type(word_or_token) is Token:
                        if word_or_token.text != BLANK_TOKEN:
                            tokens_in_first_segment.append(word_or_token)
                    else:
                        for token in word_or_token.tokens:
                            if token.text != BLANK_TOKEN:
                                tokens_in_first_segment.append(token)

                break

        for token in tokens_in_first_segment:
            token.text_cased = token.text_cased.replace(
                "▁", " "
            )  # replace underscores used in subword tokens with spaces
            token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ")  # space token with actual space
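        # e.g. (illustrative subword tokens) ["▁hel", "lo", "▁world"] becomes [" hel", "lo", " world"],
        # so joining the tokens with "" below reconstructs " hello world"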

        text_before_speech = (
            not_yet_spoken_color_code + "".join([x.text_cased for x in tokens_in_first_segment]) + r"{\r}"
        )
        subtitle_text = (
            f"Dialogue: 0,{seconds_to_ass_format(0)},{seconds_to_ass_format(tokens_in_first_segment[0].t_start)},Default,,0,0,0,,"
            + text_before_speech.rstrip()
        )

        f.write(subtitle_text + '\n')

        for segment_or_token in utt_obj.segments_and_tokens:
            if type(segment_or_token) is Segment:
                segment = segment_or_token

                tokens_in_segment = []  # make list of (non-blank) tokens
                for word_or_token in segment.words_and_tokens:
                    if type(word_or_token) is Token:
                        if word_or_token.text != BLANK_TOKEN:
                            tokens_in_segment.append(word_or_token)
                    else:
                        for token in word_or_token.tokens:
                            if token.text != BLANK_TOKEN:
                                tokens_in_segment.append(token)

                for token in tokens_in_segment:
                    token.text_cased = token.text_cased.replace(
                        "▁", " "
                    )  # replace underscores used in subword tokens with spaces
                    token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ")  # space token with actual space

                for token_i, token in enumerate(tokens_in_segment):

                    text_before = "".join([x.text_cased for x in tokens_in_segment[:token_i]])
                    text_before = already_spoken_color_code + text_before + r"{\r}"

                    if token_i < len(tokens_in_segment) - 1:
                        text_after = "".join([x.text_cased for x in tokens_in_segment[token_i + 1 :]])
                    else:
                        text_after = ""
                    text_after = not_yet_spoken_color_code + text_after + r"{\r}"

                    aligned_text = being_spoken_color_code + token.text_cased + r"{\r}"
                    aligned_text_off = already_spoken_color_code + token.text_cased + r"{\r}"

                    subtitle_text = (
                        f"Dialogue: 0,{seconds_to_ass_format(token.t_start)},{seconds_to_ass_format(token.t_end)},Default,,0,0,0,,"
                        + text_before
                        + aligned_text
                        + text_after.rstrip()
                    )
                    f.write(subtitle_text + '\n')

                    # add subtitles without token-highlighting for any gaps longer than 1 ms
                    # during which no token is being spoken
                    if token_i < len(tokens_in_segment) - 1:
                        last_token_end = float(tokens_in_segment[token_i].t_end)
                        next_token_start = float(tokens_in_segment[token_i + 1].t_start)
                        if next_token_start - last_token_end > 0.001:
                            subtitle_text = (
                                f"Dialogue: 0,{seconds_to_ass_format(last_token_end)},{seconds_to_ass_format(next_token_start)},Default,,0,0,0,,"
                                + text_before
                                + aligned_text_off
                                + text_after.rstrip()
                            )
                            f.write(subtitle_text + '\n')

        # Write final set of subtitles for text after speech has been spoken.
        # To do this, we need to collect 'tokens_in_final_segment' so that we know what the final line is.
        tokens_in_final_segment = []
        for segment_or_token in utt_obj.segments_and_tokens[::-1]:
            # collect tokens from the final segment - we 'break' so we only look at the final one
            if type(segment_or_token) is Segment:
                # 'segment_or_token' is known to be a Segment, which has the attribute 'words_and_tokens'
                for word_or_token in segment_or_token.words_and_tokens:
                    if type(word_or_token) is Token:
                        if word_or_token.text != BLANK_TOKEN:
                            tokens_in_final_segment.append(word_or_token)
                    else:
                        # 'word_or_token' is known to be a Word, which has the attribute 'tokens'
                        for token in word_or_token.tokens:
                            if token.text != BLANK_TOKEN:
                                tokens_in_final_segment.append(token)
                break

        for token in tokens_in_final_segment:
            token.text_cased = token.text_cased.replace(
                "▁", " "
            )  # replace underscores used in subword tokens with spaces
            token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ")  # space token with actual space

        text_after_speech = (
            already_spoken_color_code + "".join([x.text_cased for x in tokens_in_final_segment]) + r"{\r}"
        )
        # note: for now we do some extra padding with math.ceil(audio_dur) + 1 to account for the fact that the
        # video with subtitles can become longer than the original audio during the MP4 creation stage
        subtitle_text = (
            f"Dialogue: 0,{seconds_to_ass_format(tokens_in_final_segment[-1].t_end)},{seconds_to_ass_format(math.ceil(audio_dur)+1)},Default,,0,0,0,,"
            + text_after_speech.rstrip()
        )

        f.write(subtitle_text + '\n')

    utt_obj.saved_output_files["tokens_level_ass_filepath"] = output_file

    return utt_obj