forked from ttpro1995/TreeLSTMSentiment
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDependencyParse.java
More file actions
140 lines (124 loc) · 4.61 KB
/
Copy pathDependencyParse.java
File metadata and controls
140 lines (124 loc) · 4.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import edu.stanford.nlp.process.WordTokenFactory;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.parser.nndep.DependencyParser;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Properties;
import java.util.Scanner;
/**
 * Command-line tool that reads sentences from standard input (one per line),
 * POS-tags and dependency-parses each of them, and writes three line-aligned
 * output files:
 * <ul>
 *   <li>{@code -tokpath}: the space-separated tokens of each sentence,</li>
 *   <li>{@code -parentpath}: for every token, the index of its governor
 *       (0 is the root; -1 means the parser assigned no parent),</li>
 *   <li>{@code -relpath}: for every token, the name of the relation to its
 *       governor.</li>
 * </ul>
 * If the {@code -tokenize} option is present the input is tokenized with
 * {@link PTBTokenizer}; otherwise each line is split on single spaces.
 */
public class DependencyParse {

    /** Path of the POS tagger model passed to {@link MaxentTagger}. */
    public static final String TAGGER_MODEL = "stanford-tagger/models/english-left3words-distsim.tagger";

    /** Path of the neural dependency parser model. */
    public static final String PARSER_MODEL = "edu/stanford/nlp/models/parser/nndep/english_SD.gz";

    /**
     * Program entry point.
     *
     * @param args command-line options: {@code -tokpath}, {@code -parentpath}
     *             and {@code -relpath} are required, {@code -tokenize} is
     *             optional
     * @throws Exception if an output file cannot be opened or written, or if
     *                   model loading / parsing fails
     */
    public static void main(String[] args) throws Exception {
        Properties props = StringUtils.argsToProperties(args);
        if (!props.containsKey("tokpath") ||
            !props.containsKey("parentpath") ||
            !props.containsKey("relpath")) {
            System.err.println(
                "usage: java DependencyParse -tokenize - -tokpath <tokpath> -parentpath <parentpath> -relpath <relpath>");
            System.exit(1);
        }

        boolean tokenize = props.containsKey("tokenize");
        String tokPath = props.getProperty("tokpath");
        String parentPath = props.getProperty("parentpath");
        String relPath = props.getProperty("relpath");

        // try-with-resources guarantees the writers are flushed and closed even
        // when tagging, parsing or writing throws part-way through the input.
        try (BufferedWriter tokWriter = new BufferedWriter(new FileWriter(tokPath));
             BufferedWriter parentWriter = new BufferedWriter(new FileWriter(parentPath));
             BufferedWriter relWriter = new BufferedWriter(new FileWriter(relPath))) {

            MaxentTagger tagger = new MaxentTagger(TAGGER_MODEL);
            DependencyParser parser = DependencyParser.loadFromModelFile(PARSER_MODEL);

            // Not closed on purpose: closing the Scanner would close System.in.
            Scanner stdin = new Scanner(System.in);
            int count = 0;
            long start = System.currentTimeMillis();

            while (stdin.hasNextLine()) {
                String line = stdin.nextLine();
                List<HasWord> tokens = readTokens(line, tokenize);

                if (tokens.isEmpty()) {
                    // A sentence without tokens (e.g. a blank line in tokenize
                    // mode) has nothing to parse. Emit an empty line in every
                    // output so the three files stay aligned with the input.
                    tokWriter.write("\n");
                    parentWriter.write("\n");
                    relWriter.write("\n");
                } else {
                    List<TaggedWord> tagged = tagger.tagSentence(tokens);
                    int len = tagged.size();
                    Collection<TypedDependency> tdl = parser.predict(tagged).typedDependencies();

                    // if a node has a parent of -1 at the end of parsing, then
                    // the node has no parent.
                    int[] parents = new int[len];
                    for (int i = 0; i < len; i++) {
                        parents[i] = -1;
                    }

                    String[] relns = new String[len];
                    for (TypedDependency td : tdl) {
                        // let root have index 0; token indices are 1-based, so
                        // shift by one to address the arrays
                        int child = td.dep().index();
                        int parent = td.gov().index();
                        relns[child - 1] = td.reln().toString();
                        parents[child - 1] = parent;
                    }

                    tokWriter.write(joinTokens(tokens, len, tokenize));
                    parentWriter.write(joinInts(parents));
                    relWriter.write(joinStrings(relns));
                }

                count++;
                if (count % 1000 == 0) {
                    double elapsed = (System.currentTimeMillis() - start) / 1000.0;
                    System.err.printf("Parsed %d lines (%.2fs)\n", count, elapsed);
                }
            }

            long totalTimeMillis = System.currentTimeMillis() - start;
            // Avoid printing NaN/Infinity when the input was empty.
            double millisPerLine = count == 0 ? 0.0 : totalTimeMillis / (double) count;
            System.err.printf("Done: %d lines in %.2fs (%.1fms per line)\n",
                count, totalTimeMillis / 1000.0, millisPerLine);
        }
    }

    /** Turns one input line into tokens, via PTBTokenizer or a plain split on single spaces. */
    private static List<HasWord> readTokens(String line, boolean tokenize) {
        List<HasWord> tokens = new ArrayList<>();
        if (tokenize) {
            PTBTokenizer<Word> tokenizer = new PTBTokenizer<Word>(
                new StringReader(line), new WordTokenFactory(), "");
            while (tokenizer.hasNext()) {
                tokens.add(tokenizer.next());
            }
        } else {
            for (String word : line.split(" ")) {
                tokens.add(new Word(word));
            }
        }
        return tokens;
    }

    /**
     * Builds the newline-terminated, space-separated token line for the first
     * {@code len} tokens; PTB-escaped tokens are converted back to text when
     * the input was tokenized by PTBTokenizer.
     */
    private static String joinTokens(List<HasWord> tokens, int len, boolean tokenize) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < len; i++) {
            if (i > 0) {
                sb.append(' ');
            }
            String word = tokens.get(i).word();
            sb.append(tokenize ? PTBTokenizer.ptbToken2Text(word) : word);
        }
        return sb.append('\n').toString();
    }

    /** Builds a newline-terminated, space-separated line from the given ints. */
    private static String joinInts(int[] values) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < values.length; i++) {
            if (i > 0) {
                sb.append(' ');
            }
            sb.append(values[i]);
        }
        return sb.append('\n').toString();
    }

    /** Builds a newline-terminated, space-separated line from the given strings (null prints as "null"). */
    private static String joinStrings(String[] values) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < values.length; i++) {
            if (i > 0) {
                sb.append(' ');
            }
            sb.append(values[i]);
        }
        return sb.append('\n').toString();
    }
}