|
| 1 | +import os |
| 2 | +import logging |
| 3 | +import tempfile |
| 4 | +import subprocess |
| 5 | +import re |
| 6 | + |
| 7 | +from configparser import ConfigParser |
| 8 | + |
| 9 | +import numpy as np |
| 10 | + |
def get_config(config_path="config.ini"):
    """Load the general and aligner settings from an INI file.

    Args:
        config_path: Path of the INI file to parse. Defaults to
            ``"config.ini"``, resolved against the current working directory.

    Returns:
        A ``(config_general, config_aligner)`` tuple holding the ``GENERAL``
        and ``ALIGNER`` sections of the file.

    Raises:
        FileNotFoundError: If the configuration file could not be read.
        KeyError: If the ``ALIGNER`` or ``GENERAL`` section is missing.
    """
    config = ConfigParser()
    # ConfigParser.read() skips unreadable files without raising and returns
    # the list of files it actually parsed; an empty list means nothing was
    # loaded, which would otherwise surface as a confusing KeyError below.
    if not config.read(config_path):
        raise FileNotFoundError(
            "Could not read configuration file: {}".format(config_path)
        )
    config_aligner = config["ALIGNER"]
    config_general = config["GENERAL"]
    return config_general, config_aligner
| 17 | + |
def read_data(input_loc, lang1_in_file, lang2_in_file):
    """Read the two parallel sentence files, one sentence per line.

    Args:
        input_loc: Directory containing both input files.
        lang1_in_file: File name of the first-language sentences.
        lang2_in_file: File name of the second-language sentences.

    Returns:
        A ``(lang1_in, lang2_in)`` tuple of lists of lines. Leading and
        trailing whitespace of the whole file is stripped before splitting,
        so an empty file yields ``[""]`` rather than ``[]``.
    """

    def _read_sentences(file_name):
        # Decode explicitly as UTF-8 so non-ASCII text is read the same way
        # regardless of the platform's default locale encoding.
        with open(os.path.join(input_loc, file_name), "r", encoding="utf-8") as f:
            return f.read().strip().split("\n")

    lang1_in = _read_sentences(lang1_in_file)
    lang2_in = _read_sentences(lang2_in_file)
    return lang1_in, lang2_in
| 24 | + |
# Patterns used by clean_sentence, compiled once at import time.
_PARENS_RE = re.compile(r"[()]")
_PUNCT_RE = re.compile(r"([?!,.])")
_SPACES_RE = re.compile(r"\s+")


def clean_sentence(sent):
    """Normalize a sentence into single-space-separated tokens.

    Parentheses are dropped, each of ``? ! , .`` is padded with spaces so it
    becomes its own token, and every run of whitespace is collapsed to one
    space.

    Args:
        sent: The raw sentence.

    Returns:
        The cleaned sentence with no leading or trailing whitespace.
    """
    # Remove parentheses *before* collapsing whitespace: doing it afterwards
    # turns "a ( b" into "a  b", leaving a double space in the output.
    without_parens = _PARENS_RE.sub("", sent)
    padded = _PUNCT_RE.sub(r" \1 ", without_parens)
    return _SPACES_RE.sub(" ", padded).strip()
| 27 | + |
def write_pfms(indices, total_length, pfms_file, lang1_code, lang2_code,
               input_dir=None, output_dir=None):
    """Write one PFMS score per selected sentence to the output directory.

    The result is written to ``<lang1_code>-to-<lang2_code>_pfms.txt``, one
    score per line with no trailing newline.

    Args:
        indices: Line indices to pick from the input PFMS file.
        total_length: Number of ``"0.0"`` placeholder scores to write when no
            PFMS file is given.
        pfms_file: Name of the input PFMS file inside the input directory;
            a falsy value means "no scores available".
        lang1_code: First language code, used in the output file name.
        lang2_code: Second language code, used in the output file name.
        input_dir: Directory holding ``pfms_file``. When ``None``, the
            module-level ``input_loc`` is used, falling back to ``"data"``.
        output_dir: Directory to write into. When ``None``, the module-level
            ``output_loc`` is used, falling back to ``"data"``.

    Raises:
        IndexError: If an index is beyond the last line of the PFMS file.
    """
    # The script entry point defines input_loc/output_loc as module globals;
    # look them up lazily so the function also works when this module is
    # imported and those globals do not exist.
    if input_dir is None:
        input_dir = globals().get("input_loc", "data")
    if output_dir is None:
        output_dir = globals().get("output_loc", "data")

    if not pfms_file:
        pfms_scores = ["0.0"] * total_length
    else:
        with open(os.path.join(input_dir, pfms_file), "r", encoding="utf-8") as f:
            lines = f.read().split("\n")
        pfms_scores = [lines[index] for index in indices]

    os.makedirs(output_dir, exist_ok=True)
    out_name = lang1_code + "-to-" + lang2_code + "_pfms.txt"
    with open(os.path.join(output_dir, out_name), "w", encoding="utf-8") as f:
        f.write("\n".join(pfms_scores))
| 41 | + |
def fast_align(pfms_file, lang1_code, lang2_code, align_op_file, lang1_op_file, lang2_op_file):
    """Align the loaded sentence pairs with fast_align and write the results.

    Runs the ``fast_align`` binary once normally and once with ``-r``, merges
    the two outputs with ``atools -c grow-diag-final-and``, then writes the
    sentences, the merged alignments and the PFMS scores to the output
    directory.

    Reads the module-level ``lang1_in``, ``lang2_in``, ``output_loc`` and
    ``logger`` that the script entry point defines. The binaries are looked
    up under ``alignment_generator/fast_align-master/build`` relative to the
    current working directory.

    Args:
        pfms_file: Name of the input PFMS file (forwarded to ``write_pfms``).
        lang1_code: First language code (forwarded to ``write_pfms``).
        lang2_code: Second language code (forwarded to ``write_pfms``).
        align_op_file: Output file name for the merged alignments.
        lang1_op_file: Output file name for the first-language sentences.
        lang2_op_file: Output file name for the second-language sentences.

    Raises:
        subprocess.CalledProcessError: If ``fast_align`` or ``atools`` exits
            with a non-zero status.
        IndexError: If fewer alignment lines than sentences were produced.
    """
    total_length = len(lang1_in)

    with tempfile.TemporaryDirectory() as tmpdir:
        parallel_path = os.path.join(tmpdir, "parallel")
        work_dir = os.path.join(tmpdir, "fast_align")
        forward_path = os.path.join(work_dir, "forward.align")
        reverse_path = os.path.join(work_dir, "reverse.align")
        merged_path = os.path.join(tmpdir, "alignments")

        # One "source ||| target" pair per line; written as UTF-8 so the
        # external tool sees the same bytes on every platform.
        with open(parallel_path, "w", encoding="utf-8") as f:
            f.write("\n".join(
                "{} ||| {}".format(lang1_s, lang2_s)
                for lang1_s, lang2_s in zip(lang1_in, lang2_in)
            ))

        build_dir = os.path.join(
            os.getcwd(), "alignment_generator", "fast_align-master", "build"
        )
        align_cmd = [
            os.path.join(build_dir, "fast_align"),
            "-i", parallel_path,
            "-o",
            "-v",
        ]
        os.makedirs(work_dir, exist_ok=True)

        # check=True: without it a failed run just leaves an empty output
        # file and the error only shows up later as a confusing IndexError.
        with open(forward_path, "w") as stdout:
            subprocess.run(align_cmd, stdout=stdout, check=True)
        with open(reverse_path, "w") as stdout:
            subprocess.run(align_cmd + ["-r"], stdout=stdout, check=True)

        merge_cmd = [
            os.path.join(build_dir, "atools"),
            "-c", "grow-diag-final-and",
            "-i", forward_path,
            "-j", reverse_path,
        ]
        with open(merged_path, "w") as stdout:
            subprocess.run(merge_cmd, stdout=stdout, check=True)
        logger.info("Generating Alignments Done")

        # Must be read before the temporary directory is removed.
        with open(merged_path, "r", encoding="utf-8") as f:
            align_in = f.read().split("\n")

    # Every sentence is kept, in order.
    indices = list(range(total_length))
    lang1_op = [lang1_in[index] for index in indices]
    lang2_op = [lang2_in[index] for index in indices]
    align_op = [align_in[index] for index in indices]

    os.makedirs(output_loc, exist_ok=True)
    outputs = (
        (lang1_op_file, lang1_op),
        (lang2_op_file, lang2_op),
        (align_op_file, align_op),
    )
    for file_name, rows in outputs:
        with open(os.path.join(output_loc, file_name), "w", encoding="utf-8") as f:
            f.write("\n".join(rows))

    # Writes PFMS file
    write_pfms(indices, total_length, pfms_file, lang1_code, lang2_code)
| 100 | + |
if __name__ == "__main__":
    import sys

    # setup logging; without a configured handler the INFO progress message
    # emitted during alignment would be discarded.
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # setup file paths using config file; a missing key and an empty value
    # both fall back to the default.
    config_general, config_aligner = get_config()
    lang1 = config_general.get("language_1") or "HINDI"
    lang1_code = lang1.lower()[:2]
    lang2 = config_general.get("language_2") or "ENGLISH"
    lang2_code = lang2.lower()[:2]
    input_loc = config_general.get("input_loc") or "data"
    output_loc = config_general.get("output_loc") or "data"

    default_prefix = lang1_code + "-to-" + lang2_code
    lang1_in_file = config_aligner.get("source_inp_file") or default_prefix + "-input_lang1"
    lang2_in_file = config_aligner.get("target_inp_file") or default_prefix + "-input_lang2"
    lang1_op_file = config_aligner.get("source_op_file") or lang1_in_file
    lang2_op_file = config_aligner.get("target_op_file") or lang2_in_file
    pfms_file = config_aligner.get("pfms_file") or ""
    align_op_file = (
        config_aligner.get("align_op_file")
        or default_prefix + "-input_parallel_alignments"
    )

    # read data
    lang1_in, lang2_in = read_data(input_loc, lang1_in_file, lang2_in_file)

    # clean data
    lang1_in = [clean_sentence(sentence) for sentence in lang1_in]
    lang2_in = [clean_sentence(sentence) for sentence in lang2_in]

    # check if both files are of equal length; an explicit test is used
    # because assert statements are removed when Python runs with -O.
    if len(lang1_in) != len(lang2_in):
        logger.error(
            "Mismatch in length of %s (%d) and %s (%d) sentences",
            lang1, len(lang1_in), lang2, len(lang2_in),
        )
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)

    # Learn alignments on all sentences
    fast_align(pfms_file, lang1_code, lang2_code, align_op_file, lang1_op_file, lang2_op_file)
0 commit comments