[Yanel-commits] rev 23384 - in public/yanel/trunk/src/resources/add-realm/src: build java/org/wyona/yanel/impl/resources

josias at wyona.com josias at wyona.com
Fri Mar 23 17:05:55 CET 2007


Author: josias
Date: 2007-03-23 17:05:53 +0100 (Fri, 23 Mar 2007)
New Revision: 23384

Modified:
   public/yanel/trunk/src/resources/add-realm/src/build/dependencies.xml
   public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/AddRealmResource.java
   public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/EventLog.java
   public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/ImportSiteThread.java
Log:
Some minor improvements/bugfixes (don't collect all link events, to save memory; correctly create the redirect to the first page; show 'import completed' only when the import is really finished), and updated to the new crawler version.

Modified: public/yanel/trunk/src/resources/add-realm/src/build/dependencies.xml
===================================================================
--- public/yanel/trunk/src/resources/add-realm/src/build/dependencies.xml	2007-03-23 15:16:20 UTC (rev 23383)
+++ public/yanel/trunk/src/resources/add-realm/src/build/dependencies.xml	2007-03-23 16:05:53 UTC (rev 23384)
@@ -14,7 +14,7 @@
 
     <artifact:dependencies pathId="maven2.resource.classpath" filesetId="maven2.resource.fileset">
       <remoteRepository refid="wyona.remote.repository"/>
-      <dependency groupId="wyona-org-crawler" artifactId="wyona-org-crawler" version="0.0.1-dev-r23346"/>
+      <dependency groupId="wyona-org-crawler" artifactId="wyona-org-crawler" version="0.0.1-dev-r23383"/>
       <dependency groupId="apache-jakarta-commons" artifactId="apache-jakarta-commons-io" version="1.2"/>
 
     </artifact:dependencies>

Modified: public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/AddRealmResource.java
===================================================================
--- public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/AddRealmResource.java	2007-03-23 15:16:20 UTC (rev 23383)
+++ public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/AddRealmResource.java	2007-03-23 16:05:53 UTC (rev 23384)
@@ -323,7 +323,8 @@
         session.setAttribute(SESSION_ATTR_REALM_NAME, realm.getName());
         
         // start crawler in new thread to be able to show progress:
-        ImportSiteThread thread = new ImportSiteThread(crawler, realm, dumpDir, crawlStartURL);
+        ImportSiteThread thread = new ImportSiteThread(crawler, realm, dumpDir, crawlStartURL, 
+                crawlScopeURLs, eventLog);
         thread.start();
     }
     

Modified: public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/EventLog.java
===================================================================
--- public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/EventLog.java	2007-03-23 15:16:20 UTC (rev 23383)
+++ public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/EventLog.java	2007-03-23 16:05:53 UTC (rev 23384)
@@ -32,12 +32,21 @@
      */
     public void crawled (LinkEvent event) {
         if (event.getID() == LinkEvent.DOWNLOADED) {
-            this.downloadEvents.add(event);
+            this.downloadEvents.add(createEventDescription(event));
         } else if (event.getID() == LinkEvent.ERROR) {
-            this.errorEvents.add(event);
+            this.errorEvents.add(createEventDescription(event));
         }
     }
     
+    private String createEventDescription(LinkEvent event) {
+        String desc = event.toString().replaceAll(">", "&gt;");
+        desc = desc.replaceAll("<", "&lt;");
+        desc = desc.replaceAll("'", "&apos;");
+        desc = desc.replaceAll("\"", "&quot;");
+        desc = desc.replaceAll("&", "&amp;");
+        return desc;
+    }
+    
     /**
      * Returns all download events.
      * @return
@@ -45,13 +54,7 @@
     public String getDownloadEvents() {
         StringBuffer buf = new StringBuffer();
         for (int i=0; i<this.downloadEvents.size(); i++) {
-            LinkEvent event = (LinkEvent)this.downloadEvents.get(i);
-            String desc = event.toString().replaceAll(">", "&gt;");
-            desc = desc.replaceAll("<", "&lt;");
-            desc = desc.replaceAll("'", "&apos;");
-            desc = desc.replaceAll("\"", "&quot;");
-            desc = desc.replaceAll("&", "&amp;");
-            buf.append(desc + "\n");
+            buf.append(this.downloadEvents.get(i) + "\n");
         }
         return buf.toString();
     }
@@ -86,12 +89,15 @@
     }
 
     public void stopped(CrawlEvent event) {
-        this.isDone = true;
     }
 
     public void timedOut(CrawlEvent event) {
     }
     
+    public void importFinished() {
+        this.isDone = true;
+    }
+    
     public boolean isDone() {
         return this.isDone;
     }

Modified: public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/ImportSiteThread.java
===================================================================
--- public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/ImportSiteThread.java	2007-03-23 15:16:20 UTC (rev 23383)
+++ public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/ImportSiteThread.java	2007-03-23 16:05:53 UTC (rev 23384)
@@ -40,18 +40,23 @@
     private DumpingCrawler crawler;
     private String dumpDir;
     private String crawlStartURL;
+    private String[] crawlScopeURLs;
     private Realm realm;
+    private EventLog eventLog;
     
     private HashMap mimeTypeMap;
     private HashMap encodingMap;
     
-    public ImportSiteThread(DumpingCrawler crawler, Realm realm, String dumpDir, String crawlStartURL) {
+    public ImportSiteThread(DumpingCrawler crawler, Realm realm, String dumpDir, 
+            String crawlStartURL, String[] crawlScopeURLs, EventLog eventLog) {
         this.crawler = crawler;
         this.realm = realm;
         this.dumpDir = dumpDir;
         this.crawlStartURL = crawlStartURL;
+        this.crawlScopeURLs = crawlScopeURLs;
         this.mimeTypeMap = new HashMap();
         this.encodingMap = new HashMap();
+        this.eventLog = eventLog;
     }
     
     public void run() {
@@ -70,8 +75,10 @@
             // remove temp dump dir
             FileUtils.deleteDirectory(new File(dumpDir));
             
-            fixRootNode(crawlStartURL, root);
+            fixRootNode(crawlStartURL, crawlScopeURLs[0], root);
             addResourceConfiguration(realm.getRTIRepository());
+            
+            eventLog.importFinished();
         } catch (Exception e) {
             log.error(e, e);
             throw new RuntimeException(e.toString(), e);
@@ -237,16 +244,18 @@
      * @param root
      * @throws RepositoryException 
      */
-    protected void fixRootNode(String crawlStartURL, Node root) {
+    protected void fixRootNode(String crawlStartURL, String crawlScopeURL, Node root) {
         try {
             URL url = new URL(crawlStartURL);
             String path = url.getPath();
-            String crawlRoot = null;
+            String crawlRoot = crawlStartURL.substring(crawlScopeURL.length());
             if (path.length() == 0 || path.endsWith("/")) {
                 crawlRoot = "index.html";
-            } else if (path.indexOf("/") > -1) {
-                crawlRoot = path.substring(path.lastIndexOf("/") + 1);
             }
+            if (crawlRoot.startsWith("/")) {
+                crawlRoot = crawlRoot.substring(1);
+            }
+            
             log.debug("crawlRoot: " + crawlRoot);
             if (crawlRoot != null && root.hasNode(crawlRoot)) {
                 PrintWriter writer = new PrintWriter(new OutputStreamWriter(root.getOutputStream()));
@@ -259,6 +268,8 @@
                 writer.println("</html>");
                 writer.flush();
                 writer.close();
+            } else {
+                log.error("crawl root node " + crawlRoot + " does not exist");
             }
         } catch (Exception e) {
             log.error(e, e);




More information about the Yanel-commits mailing list