[Yanel-commits] rev 23301 - in public/yanel/trunk/src/resources/add-realm/src: build java/org/wyona/yanel/impl/resources

michi at wyona.com michi at wyona.com
Mon Mar 19 15:10:24 CET 2007


Author: michi
Date: 2007-03-19 15:10:22 +0100 (Mon, 19 Mar 2007)
New Revision: 23301

Modified:
   public/yanel/trunk/src/resources/add-realm/src/build/dependencies.xml
   public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/EventLog.java
   public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/ImportSiteThread.java
Log:
Various bugs fixed, e.g. the correct mime-type and encoding are now read from the dump's .meta file.

Modified: public/yanel/trunk/src/resources/add-realm/src/build/dependencies.xml
===================================================================
--- public/yanel/trunk/src/resources/add-realm/src/build/dependencies.xml	2007-03-19 13:55:55 UTC (rev 23300)
+++ public/yanel/trunk/src/resources/add-realm/src/build/dependencies.xml	2007-03-19 14:10:22 UTC (rev 23301)
@@ -14,7 +14,7 @@
 
     <artifact:dependencies pathId="maven2.resource.classpath" filesetId="maven2.resource.fileset">
       <remoteRepository refid="wyona.remote.repository"/>
-      <dependency groupId="wyona-org-crawler" artifactId="wyona-org-crawler" version="0.0.1-dev-r23135"/>
+      <dependency groupId="wyona-org-crawler" artifactId="wyona-org-crawler" version="0.0.1-dev-r23281"/>
       <dependency groupId="apache-jakarta-commons" artifactId="apache-jakarta-commons-io" version="1.2"/>
 
     </artifact:dependencies>

Modified: public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/EventLog.java
===================================================================
--- public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/EventLog.java	2007-03-19 13:55:55 UTC (rev 23300)
+++ public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/EventLog.java	2007-03-19 14:10:22 UTC (rev 23301)
@@ -45,7 +45,13 @@
     public String getDownloadEvents() {
         StringBuffer buf = new StringBuffer();
         for (int i=0; i<this.downloadEvents.size(); i++) {
-            buf.append(this.downloadEvents.get(i) + "\n");
+            LinkEvent event = (LinkEvent)this.downloadEvents.get(i);
+            String desc = event.toString().replaceAll("&", "&amp;"); // '&' first, else the entities below get escaped twice
+            desc = desc.replaceAll("<", "&lt;");
+            desc = desc.replaceAll(">", "&gt;");
+            desc = desc.replaceAll("'", "&apos;");
+            desc = desc.replaceAll("\"", "&quot;");
+            buf.append(desc + "\n"); // one escaped event description per line
         }
         return buf.toString();
     }

Modified: public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/ImportSiteThread.java
===================================================================
--- public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/ImportSiteThread.java	2007-03-19 13:55:55 UTC (rev 23300)
+++ public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/ImportSiteThread.java	2007-03-19 14:10:22 UTC (rev 23301)
@@ -4,14 +4,19 @@
 
 package org.wyona.yanel.impl.resources;
 
+import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.io.OutputStream;
 import java.io.OutputStreamWriter;
 import java.io.PrintWriter;
 import java.net.URL;
+import java.util.HashMap;
 
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.FilenameUtils;
@@ -37,11 +42,16 @@
     private String crawlStartURL;
     private Realm realm;
     
+    private HashMap mimeTypeMap;
+    private HashMap encodingMap;
+    
     public ImportSiteThread(DumpingCrawler crawler, Realm realm, String dumpDir, String crawlStartURL) {
         this.crawler = crawler;
         this.realm = realm;
         this.dumpDir = dumpDir;
         this.crawlStartURL = crawlStartURL;
+        this.mimeTypeMap = new HashMap(); // local path -> mimetype, filled by readMeta()
+        this.encodingMap = new HashMap(); // local path -> encoding, filled by readMeta()
     }
     
     public void run() {
@@ -54,6 +64,7 @@
             deleteRepositoryContent(realm.getRepository());
             deleteRepositoryContent(realm.getRTIRepository());
             Node root = realm.getRepository().getRootNode();
+            readMeta();
             importContent(new File(dumpDir), root);
             
             // remove temp dump dir
@@ -68,6 +79,29 @@
     }
     
     /**
+     * Reads the .meta file of the dump and fills the mimetype and encoding maps.
+     * Each line has the pattern path,mimetype[,encoding]; lines without a mimetype are skipped.
+     * @throws IOException if the .meta file cannot be opened or read
+     */
+    protected void readMeta() throws IOException {
+        File meta = new File(this.dumpDir + File.separator + ".meta");
+        BufferedReader reader = new BufferedReader(new FileReader(meta));
+        try {
+            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
+                String[] tokens = line.split(",");
+                if (tokens.length > 1) { // guards against blank or malformed lines
+                    this.mimeTypeMap.put(tokens[0], tokens[1]);
+                }
+                if (tokens.length > 2) { // the encoding column is optional
+                    this.encodingMap.put(tokens[0], tokens[2]);
+                }
+            }
+        } finally {
+            reader.close(); // release the file handle even if reading fails
+        }
+    }
+    
+    /**
      * Imports the content of the given directory into the repository as child nodes
      * of the given node. This will recursively add the complete subtree.
      * If a Node already exists in the repository, it will be overwritten.
@@ -91,16 +125,30 @@
                 // recursion:
                 importContent(file, childNode);
             } else {
+                if (name.equals(".meta")) {
+                    continue; // don't import the dump meta file
+                }
                 if (node.hasNode(name)) {
                     childNode = node.getNode(name);
                 } else {
                     childNode = node.addNode(name, NodeType.RESOURCE);
                 }
-                String mimeType = guessMimeType(FilenameUtils.getExtension(file.getName()));
+                String mimeType;
+                String path = getLocalPath(file);
+                if (this.mimeTypeMap.containsKey(path)) {
+                    mimeType = (String)this.mimeTypeMap.get(path);
+                } else {
+                    mimeType = guessMimeType(FilenameUtils.getExtension(file.getName()));
+                }
                 InputStream is = new FileInputStream(file);
                 OutputStream os = childNode.getOutputStream();
-                if (mimeType.equals("text/html")) {
-                    addIntrospectionLink(is, os);
+                if (mimeType.equals("text/html") || mimeType.equals("application/xhtml+xml")) {
+                    String encoding = "utf-8";
+                    if (this.encodingMap.containsKey(path)) {
+                        encoding = (String)this.encodingMap.get(path);
+                    }
+                    addIntrospectionLink(is, os, encoding);
+                    childNode.setEncoding(encoding);
                 } else {
                     byte[] buf = new byte[8192];
                     int bytesRead;
@@ -117,42 +165,64 @@
     }
     
     /**
+     * Returns the path of a dumped file relative to the root dir of the dump.
+     * @param file a file located below the dump dir
+     * @return the canonical path of the file without the canonical dump dir prefix
+     * @throws IOException if the file is not located below the dump dir
+     */
+    protected String getLocalPath(File file) throws IOException {
+        String rootPath = new File(this.dumpDir).getCanonicalPath();
+        String path = file.getCanonicalPath();
+        // compare including the separator, so that a sibling dir like "dump2" is not accepted for "dump"
+        if (!path.startsWith(rootPath + File.separator)) {
+            throw new IOException("Path " + path + " must be inside of " + rootPath);
+        }
+        return path.substring(rootPath.length()+1);
+    }
+    
+    /**
     * Adds a yanel introspection link element to the head element of the current page.
-     * Note: this method is stream based and does not consider character encoding, therefore
-     * it works only for data with ascii-compatible encoding like utf-8 or iso-8859-1.
-     * TODO: remove existing introspection link if imported page already has one 
     * @param is stream of the source html page
     * @param os stream of the result html page
+     * @param encoding the encoding used to decode the input stream and to encode the output stream
     * @throws IOException
     */
-    protected void addIntrospectionLink(InputStream is, OutputStream os) throws IOException {
-        int b;
+    protected void addIntrospectionLink(InputStream is, OutputStream os, String encoding) throws IOException {
+        int c; // current character, -1 at the end of the input
         int state = OUTSIDE_TAG;
-        StringBuffer tagNameBuf = null;
-        while ((b = is.read()) != -1) {
+        InputStreamReader reader = new InputStreamReader(is, encoding);
+        OutputStreamWriter writer = new OutputStreamWriter(os, encoding);
+
+        StringBuffer tagBuf = null; // collects the complete current tag, including '<' and '>'
+        while ((c = reader.read()) != -1) {
             switch (state) {
             case OUTSIDE_TAG:
-                if (b == '<') {
-                    tagNameBuf = new StringBuffer();
+                if (c == '<') {
+                    tagBuf = new StringBuffer("<"); // start buffering; the tag is written once it is complete
                     state = INSIDE_TAG;
+                } else {
+                    writer.write(c); // text outside of tags is copied unchanged
                 }
-                os.write(b);
                 break;
             case INSIDE_TAG:
-                os.write(b);
-                if (b == '>') {
+                tagBuf.append((char)c);
+                if (c == '>') {
                     state = OUTSIDE_TAG;
-                    String tagName = tagNameBuf.toString();
-                    if (tagName.startsWith("head")) {
+                    String tag = tagBuf.toString();
+                    if (!tag.startsWith("<link") || tag.indexOf("neutron-introspection") == -1) { // drop existing introspection links
+                        writer.write(tag, 0, tag.length());
+                    }
+                    if (tag.startsWith("<head")) { // NOTE(review): case-sensitive prefix match, also matches e.g. "<header" - confirm intended
                         String introspectionLink = "<link rel=\"neutron-introspection\" type=\"application/neutron+xml\" href=\"?yanel.resource.usecase=introspection\"/>";
-                        os.write(introspectionLink.getBytes());
+                        writer.write(introspectionLink, 0, introspectionLink.length());
                     }
-                } else {
-                    tagNameBuf.append((char)b);
                 }
                 break;
             }
         }
+        writer.flush();
+        writer.close(); // NOTE(review): not reached if an exception is thrown above
+        reader.close();
     }
     
     /**




More information about the Yanel-commits mailing list