Spaces:
Runtime error
Runtime error
| import os | |
| from tqdm import tqdm | |
| import sys | |
# Two-letter language codes that the script entry point pairs with 'en'
# to build the list of language-pair directories to process.
LANGS = [
    "as", "bn", "gu", "hi", "kn", "ml",
    "mr", "or", "pa", "ta", "te",
    # "ur" is currently disabled.
]
def add_token(sent, tag_infos):
    """Prefix a sentence with special tag tokens.

    sent: the sentence (a string) to be tagged
    tag_infos: iterable of (tag_type, tag) string pairs

    Every pair becomes one token of the form __{tag_type}__{tag}__. The
    tokens are joined by single spaces and placed in front of the sentence,
    separated from it by one space. With no pairs the result is simply a
    space followed by the sentence.
    """
    prefix = ' '.join(
        '__' + kind + '__' + value + '__' for kind, value in tag_infos
    )
    return prefix + ' ' + sent
def concat_data(data_dir, outdir, lang_pair_list,
                out_src_lang='SRC', out_trg_lang='TGT', split='train'):
    """Concatenate the per-language-pair files of one split into one corpus.

    data_dir: input dir, contains directories for language pairs named l1-l2
    outdir: output dir, created if it does not exist
    lang_pair_list: iterable of (src_lang, trg_lang) pairs, processed in order
    out_src_lang, out_trg_lang: extensions of the two output files
    split: file stem of the input and output files (e.g. 'train')

    For every pair, {data_dir}/{src}-{trg}/{split}.{src} is appended to
    {outdir}/{split}.{out_src_lang} and {data_dir}/{src}-{trg}/{split}.{trg}
    is appended to {outdir}/{split}.{out_trg_lang}. A pair is skipped unless
    both of its input files exist, so the two outputs grow in lockstep.
    Existing output files are removed first. Afterwards corpus_stats() is
    called with the same data_dir, outdir, lang_pair_list and split.

    Raises OSError if an existing input file cannot be read or an output
    file cannot be written.
    """
    # Local import so this function does not depend on the file's import block.
    import shutil

    def _append_file(in_fname, out_fname):
        # Byte-for-byte append (what `cat in >> out` did), done in-process so
        # that paths containing spaces or shell metacharacters are safe and
        # I/O failures raise instead of being silently ignored.
        with open(in_fname, 'rb') as infile, open(out_fname, 'ab') as outfile:
            shutil.copyfileobj(infile, outfile)

    os.makedirs(outdir, exist_ok=True)

    out_src_fname = '{}/{}.{}'.format(outdir, split, out_src_lang)
    out_trg_fname = '{}/{}.{}'.format(outdir, split, out_trg_lang)

    print()
    print(out_src_fname)
    print(out_trg_fname)

    # Start from a clean slate: everything below appends to the outputs.
    for out_fname in (out_src_fname, out_trg_fname):
        if os.path.isfile(out_fname):
            os.unlink(out_fname)

    for src_lang, trg_lang in tqdm(lang_pair_list):
        print('src: {}, tgt:{}'.format(src_lang, trg_lang))

        in_src_fname = '{}/{}-{}/{}.{}'.format(
            data_dir, src_lang, trg_lang, split, src_lang)
        in_trg_fname = '{}/{}-{}/{}.{}'.format(
            data_dir, src_lang, trg_lang, split, trg_lang)

        # Both sides must be present, otherwise the outputs would go out of
        # alignment with each other.
        if not os.path.exists(in_src_fname):
            continue
        if not os.path.exists(in_trg_fname):
            continue

        print(in_src_fname)
        _append_file(in_src_fname, out_src_fname)

        print(in_trg_fname)
        _append_file(in_trg_fname, out_trg_fname)

    corpus_stats(data_dir, outdir, lang_pair_list, split)
def corpus_stats(data_dir, outdir, lang_pair_list, split):
    """Write the number of lines each language pair contributes to a split.

    data_dir: input dir, contains directories for language pairs named l1-l2
    outdir: directory in which {split}_lang_pairs.txt is written
    lang_pair_list: iterable of (src_lang, trg_lang) pairs, processed in order
    split: file stem of the input files (e.g. 'train')

    For every pair whose source file {data_dir}/{src}-{trg}/{split}.{src} and
    target file {data_dir}/{src}-{trg}/{split}.{trg} both exist, one line
    "{src}\\t{trg}\\t{line count of the source file}" is written. Requiring
    both files is the same skip rule concat_data() applies, so the stats
    never list a pair that was left out of the concatenated corpus.
    """
    stats_fname = '{}/{}_lang_pairs.txt'.format(outdir, split)

    with open(stats_fname, 'w', encoding='utf-8') as lpfile:
        for src_lang, trg_lang in tqdm(lang_pair_list):
            print('src: {}, tgt:{}'.format(src_lang, trg_lang))

            in_src_fname = '{}/{}-{}/{}.{}'.format(
                data_dir, src_lang, trg_lang, split, src_lang)
            in_trg_fname = '{}/{}-{}/{}.{}'.format(
                data_dir, src_lang, trg_lang, split, trg_lang)

            # Skip pairs with a missing side, exactly as concat_data() does.
            if not os.path.exists(in_src_fname):
                continue
            if not os.path.exists(in_trg_fname):
                continue

            print(in_src_fname)
            # Stream the file; only the number of lines is needed.
            with open(in_src_fname, 'r', encoding='utf-8') as infile:
                corpus_size = sum(1 for _ in infile)

            lpfile.write('{}\t{}\t{}\n'.format(
                src_lang, trg_lang, corpus_size))
if __name__ == '__main__':
    # Positional arguments: input dir, output dir, source language,
    # target language, split name.
    argv = sys.argv
    in_dir, out_dir = argv[1], argv[2]
    src_lang, tgt_lang = argv[3], argv[4]  # tgt_lang is read but not used below
    split = argv[5]

    # 'en' is always one side of every pair; src_lang only decides which side.
    if src_lang == 'en':
        lang_pair_list = [['en', lang] for lang in LANGS]
    else:
        lang_pair_list = [[lang, 'en'] for lang in LANGS]

    concat_data(in_dir, out_dir, lang_pair_list, split=split)