[Yanel-commits] rev 23301 - in
public/yanel/trunk/src/resources/add-realm/src: build
java/org/wyona/yanel/impl/resources
michi at wyona.com
michi at wyona.com
Mon Mar 19 15:10:24 CET 2007
Author: michi
Date: 2007-03-19 15:10:22 +0100 (Mon, 19 Mar 2007)
New Revision: 23301
Modified:
public/yanel/trunk/src/resources/add-realm/src/build/dependencies.xml
public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/EventLog.java
public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/ImportSiteThread.java
Log:
various bugs fixed, e.g. correct mime-type etc
Modified: public/yanel/trunk/src/resources/add-realm/src/build/dependencies.xml
===================================================================
--- public/yanel/trunk/src/resources/add-realm/src/build/dependencies.xml 2007-03-19 13:55:55 UTC (rev 23300)
+++ public/yanel/trunk/src/resources/add-realm/src/build/dependencies.xml 2007-03-19 14:10:22 UTC (rev 23301)
@@ -14,7 +14,7 @@
<artifact:dependencies pathId="maven2.resource.classpath" filesetId="maven2.resource.fileset">
<remoteRepository refid="wyona.remote.repository"/>
- <dependency groupId="wyona-org-crawler" artifactId="wyona-org-crawler" version="0.0.1-dev-r23135"/>
+ <dependency groupId="wyona-org-crawler" artifactId="wyona-org-crawler" version="0.0.1-dev-r23281"/>
<dependency groupId="apache-jakarta-commons" artifactId="apache-jakarta-commons-io" version="1.2"/>
</artifact:dependencies>
Modified: public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/EventLog.java
===================================================================
--- public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/EventLog.java 2007-03-19 13:55:55 UTC (rev 23300)
+++ public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/EventLog.java 2007-03-19 14:10:22 UTC (rev 23301)
@@ -45,7 +45,13 @@
public String getDownloadEvents() {
StringBuffer buf = new StringBuffer();
for (int i=0; i<this.downloadEvents.size(); i++) {
- buf.append(this.downloadEvents.get(i) + "\n");
+ LinkEvent event = (LinkEvent)this.downloadEvents.get(i);
+ String desc = event.toString().replaceAll(">", "&gt;");
+ desc = desc.replaceAll("<", "&lt;");
+ desc = desc.replaceAll("'", "&apos;");
+ desc = desc.replaceAll("\"", "&quot;");
+ desc = desc.replaceAll("&", "&amp;");
+ buf.append(desc + "\n");
}
return buf.toString();
}
Modified: public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/ImportSiteThread.java
===================================================================
--- public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/ImportSiteThread.java 2007-03-19 13:55:55 UTC (rev 23300)
+++ public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/ImportSiteThread.java 2007-03-19 14:10:22 UTC (rev 23301)
@@ -4,14 +4,19 @@
package org.wyona.yanel.impl.resources;
+import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
+import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URL;
+import java.util.HashMap;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
@@ -37,11 +42,16 @@
private String crawlStartURL;
private Realm realm;
+ private HashMap mimeTypeMap;
+ private HashMap encodingMap;
+
public ImportSiteThread(DumpingCrawler crawler, Realm realm, String dumpDir, String crawlStartURL) {
this.crawler = crawler;
this.realm = realm;
this.dumpDir = dumpDir;
this.crawlStartURL = crawlStartURL;
+ this.mimeTypeMap = new HashMap();
+ this.encodingMap = new HashMap();
}
public void run() {
@@ -54,6 +64,7 @@
deleteRepositoryContent(realm.getRepository());
deleteRepositoryContent(realm.getRTIRepository());
Node root = realm.getRepository().getRootNode();
+ readMeta();
importContent(new File(dumpDir), root);
// remove temp dump dir
@@ -68,6 +79,29 @@
}
/**
+ * Reads the .meta file of the dump which contains the mimetypes
+ * and the encoding of the dumped files.
+ * @throws IOException
+ */
+ protected void readMeta() throws IOException {
+ File meta = new File(this.dumpDir + File.separator + ".meta");
+ BufferedReader reader = new BufferedReader(new FileReader(meta));
+ String line;
+
+ while ((line = reader.readLine()) != null) {
+ String[] tokens = line.split(",");
+ // pattern is: path,mimetype[,encoding]
+ String path = tokens[0];
+ String mimeType = tokens[1];
+ this.mimeTypeMap.put(path, mimeType);
+ if (tokens.length > 2) {
+ String encoding = tokens[2];
+ this.encodingMap.put(path, encoding);
+ }
+ }
+ }
+
+ /**
* Imports the content of the given directory into the repository as child nodes
* of the given node. This will recursively add the complete subtree.
* If a Node already exists in the repository, it will be overwritten.
@@ -91,16 +125,30 @@
// recursion:
importContent(file, childNode);
} else {
+ if (name.equals(".meta")) {
+ continue; // don't import the dump meta file
+ }
if (node.hasNode(name)) {
childNode = node.getNode(name);
} else {
childNode = node.addNode(name, NodeType.RESOURCE);
}
- String mimeType = guessMimeType(FilenameUtils.getExtension(file.getName()));
+ String mimeType;
+ String path = getLocalPath(file);
+ if (this.mimeTypeMap.containsKey(path)) {
+ mimeType = (String)this.mimeTypeMap.get(path);
+ } else {
+ mimeType = guessMimeType(FilenameUtils.getExtension(file.getName()));
+ }
InputStream is = new FileInputStream(file);
OutputStream os = childNode.getOutputStream();
- if (mimeType.equals("text/html")) {
- addIntrospectionLink(is, os);
+ if (mimeType.equals("text/html") || mimeType.equals("application/xhtml+xml")) {
+ String encoding = "utf-8";
+ if (this.encodingMap.containsKey(path)) {
+ encoding = (String)this.encodingMap.get(path);
+ }
+ addIntrospectionLink(is, os, encoding);
+ childNode.setEncoding(encoding);
} else {
byte[] buf = new byte[8192];
int bytesRead;
@@ -117,42 +165,64 @@
}
/**
+ * Returns the local path of a dumped file, i.e. the path relative
+ * to the root dir of the dump.
+ * @param file
+ * @return
+ * @throws IOException
+ */
+ protected String getLocalPath(File file) throws IOException {
+ String rootPath = new File(this.dumpDir).getCanonicalPath();
+ String path = file.getCanonicalPath();
+ if (!path.startsWith(rootPath)) {
+ throw new IOException("Path " + path + " must be inside of " + rootPath);
+ }
+ return path.substring(rootPath.length()+1);
+ }
+
+ /**
* Adds a yanel introspection link element to the head element of the current page.
- * Note: this method is stream based and does not consider character encoding, therefore
- * it works only for data with ascii-compatible encoding like utf-8 or iso-8859-1.
- * TODO: remove existing introspection link if imported page already has one
* @param is stream of the source html page
* @param os stream of the result html page
+ * @param encoding the encoding of the input stream
* @throws IOException
*/
- protected void addIntrospectionLink(InputStream is, OutputStream os) throws IOException {
- int b;
+ protected void addIntrospectionLink(InputStream is, OutputStream os, String encoding) throws IOException {
+ int c;
int state = OUTSIDE_TAG;
- StringBuffer tagNameBuf = null;
- while ((b = is.read()) != -1) {
+ InputStreamReader reader = new InputStreamReader(is, encoding);
+ OutputStreamWriter writer = new OutputStreamWriter(os, encoding);
+
+ StringBuffer tagBuf = null;
+ while ((c = reader.read()) != -1) {
switch (state) {
case OUTSIDE_TAG:
- if (b == '<') {
- tagNameBuf = new StringBuffer();
+ if (c == '<') {
+ tagBuf = new StringBuffer("<");
state = INSIDE_TAG;
+ } else {
+ writer.write(c);
}
- os.write(b);
break;
case INSIDE_TAG:
- os.write(b);
- if (b == '>') {
+ tagBuf.append((char)c);
+ if (c == '>') {
state = OUTSIDE_TAG;
- String tagName = tagNameBuf.toString();
- if (tagName.startsWith("head")) {
+ String tag = tagBuf.toString();
+ if (!tag.startsWith("<link") || tag.indexOf("neutron-introspection") == -1) {
+ writer.write(tag, 0, tag.length());
+ }
+ if (tag.startsWith("<head")) {
String introspectionLink = "<link rel=\"neutron-introspection\" type=\"application/neutron+xml\" href=\"?yanel.resource.usecase=introspection\"/>";
- os.write(introspectionLink.getBytes());
+ writer.write(introspectionLink, 0, introspectionLink.length());
}
- } else {
- tagNameBuf.append((char)b);
}
break;
}
}
+ writer.flush();
+ writer.close();
+ reader.close();
}
/**
More information about the Yanel-commits
mailing list