Skip to content

Commit 47740ee

Browse files
Added code files
1 parent 84569cb commit 47740ee

File tree

1,782 files changed

+146136
-9
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,782 files changed

+146136
-9
lines changed

.gitignore

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,11 @@
99
*.user
1010
*.userosscache
1111
*.sln.docstates
12-
12+
.vscode
13+
CodeMixed-Text-Generator/alignment_generator/fast_align-master/
14+
CodeMixed-Text-Generator/stanford_parser/stanford_parser_full_2017_06_09/
15+
CodeMixed-Text-Generator/data/
16+
CodeMixed-Text-Generator/web/static/images/
1317
# User-specific files (MonoDevelop/Xamarin Studio)
1418
*.userprefs
1519

CodeMixed-Text-Generator/aligner.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
import os
2+
import logging
3+
import tempfile
4+
import subprocess
5+
import re
6+
7+
from configparser import ConfigParser
8+
9+
import numpy as np
10+
11+
def get_config():
    """Load ``config.ini`` from the current working directory.

    Returns:
        A ``(general, aligner)`` tuple with the ``[GENERAL]`` and
        ``[ALIGNER]`` sections of the file, in that order.

    Raises:
        KeyError: If a required section is absent. This is also what happens
            when ``config.ini`` does not exist, because ``ConfigParser.read``
            silently skips files it cannot open.
    """
    parser = ConfigParser()
    parser.read("config.ini")

    # ALIGNER is looked up first so that a missing section reports the same
    # key as it always has.
    aligner_section = parser["ALIGNER"]
    general_section = parser["GENERAL"]
    return general_section, aligner_section
17+
18+
def read_data(input_loc, lang1_in_file, lang2_in_file):
    """Read the two parallel sentence files, one sentence per line.

    Args:
        input_loc: Directory containing both input files.
        lang1_in_file: File name of the first language's sentences.
        lang2_in_file: File name of the second language's sentences.

    Returns:
        A ``(lang1_sentences, lang2_sentences)`` tuple of lists of strings.
        Leading/trailing whitespace of each file as a whole is stripped
        before splitting on newlines, so an empty file yields ``[""]``.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """

    def _read_sentences(file_name):
        # Decode as UTF-8 explicitly instead of relying on the platform's
        # locale encoding, which mis-decodes non-ASCII corpora on systems
        # whose default is not UTF-8.
        with open(os.path.join(input_loc, file_name), "r", encoding="utf-8") as f:
            return f.read().strip().split("\n")

    lang1_in = _read_sentences(lang1_in_file)
    lang2_in = _read_sentences(lang2_in_file)
    return lang1_in, lang2_in
24+
25+
def clean_sentence(sent):
    """Normalise a sentence before alignment.

    Parentheses are removed, each of ``? ! , .`` is set off as its own
    space-separated token, runs of whitespace are collapsed to a single
    space and the result is stripped.

    Args:
        sent: Raw sentence text.

    Returns:
        The cleaned sentence.
    """
    # Drop parentheses *before* collapsing whitespace; doing it afterwards
    # leaves a double space wherever a parenthesis stood between two spaces.
    without_parens = re.sub(r"[()]", "", sent)
    punct_spaced = re.sub(r"([?!,.])", r" \1 ", without_parens)
    return re.sub(r"\s+", " ", punct_spaced).strip()
27+
28+
def write_pfms(indices, total_length, pfms_file, lang1_code, lang2_code,
               *, input_dir=None, output_dir=None):
    """Write the PFMS score file ``<lang1>-to-<lang2>_pfms.txt``.

    When ``pfms_file`` is empty, one ``"0.0"`` line is written per sentence.
    Otherwise the scores are read from ``pfms_file`` and the lines selected
    by ``indices`` are written, in that order.

    Args:
        indices: Line indices to pick from ``pfms_file``.
        total_length: Number of placeholder scores to write when no
            ``pfms_file`` is given.
        pfms_file: Name of the input score file, or a falsy value for none.
        lang1_code: Code of the first language, used in the output name.
        lang2_code: Code of the second language, used in the output name.
        input_dir: Directory holding ``pfms_file``. Defaults to the
            module-level ``input_loc`` set by the script entry point.
        output_dir: Directory to write into. Defaults to the module-level
            ``output_loc`` set by the script entry point.

    Raises:
        IndexError: If ``pfms_file`` has fewer lines than an index requires.
        OSError: If a file cannot be opened.
    """
    if not pfms_file:
        pfms_scores = ["0.0"] * total_length
    else:
        if input_dir is None:
            # Fall back to the global only when it is actually needed, so the
            # function stays usable without the script-level setup.
            input_dir = input_loc
        with open(os.path.join(input_dir, pfms_file), "r") as f:
            lines = f.read().split("\n")
        pfms_scores = [lines[index] for index in indices]

    if output_dir is None:
        output_dir = output_loc
    out_name = lang1_code + "-to-" + lang2_code + "_pfms.txt"
    with open(os.path.join(output_dir, out_name), "w") as f:
        f.write("\n".join(pfms_scores))
41+
42+
def fast_align(pfms_file, lang1_code, lang2_code, align_op_file, lang1_op_file, lang2_op_file):
    """Learn word alignments for the loaded sentence pairs and write outputs.

    Runs the ``fast_align`` binary in forward and reverse direction, merges
    both with ``atools`` (``grow-diag-final-and``), then writes the sentences,
    their alignments and the PFMS scores into ``output_loc``.

    Relies on the module-level names ``lang1_in``, ``lang2_in``,
    ``output_loc`` and ``logger`` set up by the script entry point, and on
    the binaries under ``<cwd>/alignment_generator/fast_align-master/build``.

    Args:
        pfms_file: Name of the PFMS input file, or a falsy value for none.
        lang1_code: Code of the first language.
        lang2_code: Code of the second language.
        align_op_file: Output file name for the alignments.
        lang1_op_file: Output file name for the first language's sentences.
        lang2_op_file: Output file name for the second language's sentences.

    Raises:
        subprocess.CalledProcessError: If ``fast_align`` or ``atools`` exits
            with a non-zero status.
        IndexError: If fewer alignment lines than sentences were produced.
        OSError: If a binary or file cannot be opened.
    """
    total_length = len(lang1_in)

    with tempfile.TemporaryDirectory() as tmpdir:
        parallel_path = os.path.join(tmpdir, "parallel")
        align_dir = os.path.join(tmpdir, "fast_align")
        forward_path = os.path.join(align_dir, "forward.align")
        reverse_path = os.path.join(align_dir, "reverse.align")
        alignments_path = os.path.join(tmpdir, "alignments")

        # One "source ||| target" pair per line is the fast_align input format.
        with open(parallel_path, "w", encoding="utf-8") as f:
            f.write("\n".join(
                "{} ||| {}".format(lang1_s, lang2_s)
                for lang1_s, lang2_s in zip(lang1_in, lang2_in)
            ))

        path_to_fast_align = os.path.join(
            os.getcwd(), "alignment_generator", "fast_align-master", "build"
        )
        fast_align_cmd = [
            os.path.join(path_to_fast_align, "fast_align"),
            "-i",
            parallel_path,
            "-o",
            "-v",
        ]
        os.makedirs(align_dir, exist_ok=True)

        # check=True: a failed run must stop here instead of leaving an empty
        # alignment file that only blows up later with an opaque IndexError.
        with open(forward_path, "w") as stdout:
            subprocess.run(fast_align_cmd, stdout=stdout, check=True)
        with open(reverse_path, "w") as stdout:
            subprocess.run(fast_align_cmd + ["-r"], stdout=stdout, check=True)

        atools_cmd = [
            os.path.join(path_to_fast_align, "atools"),
            "-c",
            "grow-diag-final-and",
            "-i",
            forward_path,
            "-j",
            reverse_path,
        ]
        with open(alignments_path, "w") as stdout:
            subprocess.run(atools_cmd, stdout=stdout, check=True)
        logger.info("Generating Alignments Done")

        with open(alignments_path, "r") as f:
            align_in = f.read().split("\n")

    # All sentences are kept; indexing (rather than slicing) preserves the
    # IndexError when an input is shorter than expected.
    indices = list(range(total_length))
    lang1_op = [lang1_in[index] for index in indices]
    lang2_op = [lang2_in[index] for index in indices]
    align_op = [align_in[index] for index in indices]

    os.makedirs(output_loc, exist_ok=True)
    with open(os.path.join(output_loc, lang1_op_file), "w", encoding="utf-8") as f:
        f.write("\n".join(lang1_op))
    with open(os.path.join(output_loc, lang2_op_file), "w", encoding="utf-8") as f:
        f.write("\n".join(lang2_op))
    with open(os.path.join(output_loc, align_op_file), "w") as f:
        f.write("\n".join(align_op))

    # Writes PFMS file
    write_pfms(indices, total_length, pfms_file, lang1_code, lang2_code)
100+
101+
if __name__ == "__main__":
    import sys

    # setup logging; without a configured handler the INFO progress message
    # emitted during alignment would be dropped
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # setup file paths using config file; a missing or empty option falls
    # back to its default instead of raising KeyError
    config_general, config_aligner = get_config()
    lang1 = config_general.get("language_1") or "HINDI"
    lang1_code = lang1.lower()[:2]
    lang2 = config_general.get("language_2") or "ENGLISH"
    lang2_code = lang2.lower()[:2]
    input_loc = config_general.get("input_loc") or "data"
    output_loc = config_general.get("output_loc") or "data"

    default_prefix = lang1_code + "-to-" + lang2_code
    lang1_in_file = config_aligner.get("source_inp_file") or default_prefix + "-input_lang1"
    lang2_in_file = config_aligner.get("target_inp_file") or default_prefix + "-input_lang2"
    lang1_op_file = config_aligner.get("source_op_file") or lang1_in_file
    lang2_op_file = config_aligner.get("target_op_file") or lang2_in_file
    pfms_file = config_aligner.get("pfms_file")
    align_op_file = config_aligner.get("align_op_file") or default_prefix + "-input_parallel_alignments"

    # read data
    lang1_in, lang2_in = read_data(input_loc, lang1_in_file, lang2_in_file)

    # clean data
    lang1_in = [clean_sentence(sentence) for sentence in lang1_in]
    lang2_in = [clean_sentence(sentence) for sentence in lang2_in]

    # check if both files are of equal length; an explicit comparison is used
    # because assert statements are stripped under ``python -O``
    if len(lang1_in) != len(lang2_in):
        logger.error(
            "Mismatch in length of %s (%d) and %s (%d) sentences",
            lang1, len(lang1_in), lang2, len(lang2_in),
        )
        # non-zero status so calling scripts can detect the failure
        sys.exit(1)

    # Learn alignments on all sentences
    fast_align(pfms_file, lang1_code, lang2_code, align_op_file, lang1_op_file, lang2_op_file)
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
## Word alignment generator
2+
3+
# Citation
4+
* [Chris Dyer](http://www.cs.cmu.edu/~cdyer), [Victor Chahuneau](http://victor.chahuneau.fr), and [Noah A. Smith](http://www.cs.cmu.edu/~nasmith). (2013). [A Simple, Fast, and Effective Reparameterization of IBM Model 2](http://www.ark.cs.cmu.edu/cdyer/fast_valign.pdf). In *Proc. of NAACL*.
5+
6+
The source code for this aligner was obtained from the authors' repository: [fast_align](https://github.com/clab/fast_align).
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import urllib.request
2+
import zipfile
3+
import os
4+
import subprocess
5+
6+
def install_fast_align():
    """Download, extract and build fast_align in the current directory.

    Fetches the ``master`` archive from GitHub, unpacks it to
    ``fast_align-master/`` and runs ``cmake ..`` followed by ``make`` inside
    ``fast_align-master/build``. The downloaded archive is removed and the
    original working directory restored whether or not the build succeeds.

    Raises:
        urllib.error.URLError: If the archive cannot be downloaded.
        subprocess.CalledProcessError: If ``cmake`` or ``make`` fails.
        OSError: If a file-system operation fails or a tool is missing.
    """
    url = "https://github.com/clab/fast_align/archive/master.zip"
    archive = "master.zip"
    start_dir = os.getcwd()

    try:
        # fetch fast_align from github and save the fetched data as a file
        print("fetching fast_align from web")
        with urllib.request.urlopen(url) as res, open(archive, "wb") as f:
            f.write(res.read())

        print("extracting fast_align from downloaded zip file")
        with zipfile.ZipFile(archive, "r") as zip_ref:
            zip_ref.extractall(".")

        # create build directory; exist_ok lets the installer be re-run
        build_dir = os.path.join("fast_align-master", "build")
        os.makedirs(build_dir, exist_ok=True)
        os.chdir(build_dir)

        # setup make
        print("setting up make")
        p_cmake = subprocess.run(["cmake", ".."], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        print((p_cmake.stdout).decode())
        # Show the tool output first, then fail, so the cause is visible.
        p_cmake.check_returncode()

        # make to compile fast_align
        print("compiling fast_align")
        print(os.getcwd())
        p_make = subprocess.run(["make"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        print((p_make.stdout).decode())
        p_make.check_returncode()

        print("fast_align is successfully installed. Now cleaning up")
    finally:
        # get rid of extra files, even when a step above failed
        os.chdir(start_dir)
        if os.path.exists(archive):
            os.remove(archive)
    print("Done.")
41+
42+
if __name__ == "__main__":
    try:
        install_fast_align()
    except Exception as err:
        print(err)
        # Exit non-zero so callers (shell scripts, CI) can tell the install
        # failed instead of seeing a successful status.
        raise SystemExit(1)
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
output/
2+
testcases/

CodeMixed-Text-Generator/cm_text_generator/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)