-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathProcessXML.java
More file actions
92 lines (70 loc) · 3.1 KB
/
ProcessXML.java
File metadata and controls
92 lines (70 loc) · 3.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
package fy.nlp;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.File;
import java.util.List;
/**
 * Command-line utility that loads an XML file into a DOM, rewrites the text
 * of every {@code <text>} and {@code <summary>} element found directly under
 * the children of the root element, and writes the document back out.
 */
public class ProcessXML {

    /** Input file used by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_INPUT_PATH =
            "/Users/fangy/data/cl14-unprocessed/jp/music/unlabeled.review";

    /** Output file used by {@link #main(String[])} when no second argument is given. */
    private static final String DEFAULT_OUTPUT_PATH = "unlabeled.review";

    /** Progress is reported each time this many child nodes of the root have been visited. */
    private static final int PROGRESS_INTERVAL = 2000;

    /**
     * Text-processing hook applied to the content of each rewritten element.
     * The current implementation returns its argument unchanged.
     *
     * @param text original text content of a {@code <text>} or {@code <summary>} element
     * @return the text to store back into the element
     */
    public static String processText(String text) {
        // do some thing with text
        return text;
    }

    /**
     * Walks the XML document node by node, rewrites the text of the targeted
     * elements through {@link #processText(String)}, and writes the resulting
     * document to {@code resultFilePath}.
     *
     * <p>Failures (unreadable input, malformed XML, unwritable output, ...) are
     * reported on standard error and do not propagate to the caller.
     *
     * @param xmlFilePath    path of the XML file to read
     * @param resultFilePath path of the XML file to write
     */
    public static void transformXML(String xmlFilePath, String resultFilePath) {
        try {
            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
            DocumentBuilder docBuilder = dbf.newDocumentBuilder();
            Document doc = docBuilder.parse(new File(xmlFilePath));

            // Start from the root *element*: the document's first child may be a
            // comment, processing instruction or DOCTYPE, which has no item children.
            Node root = doc.getDocumentElement();

            // Sibling traversal visits every child of the root, including
            // whitespace text nodes, so the counter counts nodes, not just items.
            int count = 1;
            for (Node itemNode = root.getFirstChild(); itemNode != null;
                    itemNode = itemNode.getNextSibling()) {
                rewriteItem(itemNode);
                if (count % PROGRESS_INTERVAL == 0) {
                    System.out.println(count + " nodes complete...");
                }
                count++;
            }

            System.out.println("Start output XML file");
            TransformerFactory transformerFactory = TransformerFactory.newInstance();
            try {
                // Implementation-specific hint; it only matters if indentation is enabled.
                transformerFactory.setAttribute("indent-number", 2);
            } catch (IllegalArgumentException ignored) {
                // Factories that do not recognise the attribute reject it; the
                // hint is optional, so carry on and still write the output.
            }
            Transformer transformer = transformerFactory.newTransformer();
            DOMSource source = new DOMSource(doc);
            StreamResult result = new StreamResult(new File(resultFilePath));
            transformer.transform(source, result);
        } catch (Exception e) {
            // Best-effort contract: report the failure with context, never throw.
            System.err.println("Failed to transform '" + xmlFilePath
                    + "' into '" + resultFilePath + "'");
            e.printStackTrace();
        }
    }

    /** Replaces the text of each {@code <text>}/{@code <summary>} element child of {@code itemNode}. */
    private static void rewriteItem(Node itemNode) {
        for (Node childNode = itemNode.getFirstChild(); childNode != null;
                childNode = childNode.getNextSibling()) {
            if (childNode.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            String name = childNode.getNodeName();
            if ("text".equals(name) || "summary".equals(name)) {
                childNode.setTextContent(processText(childNode.getTextContent()));
            }
        }
    }

    /**
     * Entry point. Optional arguments: {@code args[0]} is the input XML path and
     * {@code args[1]} the output path; any argument that is missing falls back
     * to the built-in default.
     *
     * @param args optional input and output file paths
     */
    public static void main(String[] args) {
        String xmlFilePath = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
        String xmlResultFilePath = args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH;
        transformXML(xmlFilePath, xmlResultFilePath);
    }
}