[Yanel-commits] rev 23205 - in
public/yanel/trunk/src/resources/add-realm/src: build
java/org/wyona/yanel/impl/resources
josias at wyona.com
josias at wyona.com
Fri Mar 9 00:28:32 CET 2007
Author: josias
Date: 2007-03-09 00:28:31 +0100 (Fri, 09 Mar 2007)
New Revision: 23205
Modified:
public/yanel/trunk/src/resources/add-realm/src/build/dependencies.xml
public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/AddRealmResource.java
Log:
added crawler to dump and import an external site into a new realm
Modified: public/yanel/trunk/src/resources/add-realm/src/build/dependencies.xml
===================================================================
--- public/yanel/trunk/src/resources/add-realm/src/build/dependencies.xml 2007-03-08 23:25:50 UTC (rev 23204)
+++ public/yanel/trunk/src/resources/add-realm/src/build/dependencies.xml 2007-03-08 23:28:31 UTC (rev 23205)
@@ -14,10 +14,13 @@
<artifact:dependencies pathId="maven2.resource.classpath" filesetId="maven2.resource.fileset">
<remoteRepository refid="wyona.remote.repository"/>
- <!-- No resource specific libs yet -->
+ <dependency groupId="wyona-org-crawler" artifactId="wyona-org-crawler" version="0.0.1-dev-r23135"/>
+ <dependency groupId="apache-jakarta-commons" artifactId="apache-jakarta-commons-io" version="1.2"/>
+
</artifact:dependencies>
<property name="maven2.cp" refid="maven2.classpath"/>
+ <property name="maven2.resource.cp" refid="maven2.resource.classpath"/>
<!--<echo>Maven2 classpath: ${maven2.cp}</echo>-->
</target>
Modified: public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/AddRealmResource.java
===================================================================
--- public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/AddRealmResource.java 2007-03-08 23:25:50 UTC (rev 23204)
+++ public/yanel/trunk/src/resources/add-realm/src/java/org/wyona/yanel/impl/resources/AddRealmResource.java 2007-03-08 23:28:31 UTC (rev 23205)
@@ -9,18 +9,35 @@
import org.wyona.yanel.core.api.attributes.ViewableV2;
import org.wyona.yanel.core.attributes.viewable.View;
import org.wyona.yanel.core.attributes.viewable.ViewDescriptor;
+import org.wyona.yanel.core.map.Realm;
import org.wyona.yanel.core.transformation.I18nTransformer;
import org.wyona.yanel.core.util.HttpServletRequestHelper;
+import org.wyona.yarep.core.Node;
+import org.wyona.yarep.core.NodeType;
+import org.wyona.yarep.core.Repository;
+import org.wyona.yarep.core.RepositoryException;
import org.wyona.yanel.core.util.PathUtil;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.FilenameUtils;
+import org.apache.lenya.search.crawler.DumpingCrawler;
import org.apache.log4j.Category;
+import websphinx.EventLog;
+
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.io.UnsupportedEncodingException;
+import java.net.URL;
import java.util.HashMap;
import java.util.Enumeration;
import java.util.Iterator;
@@ -45,6 +62,9 @@
private static final String CRAWLER_JAR = "yanel-crawler.jar";
private static Category log = Category.getInstance(AddRealmResource.class);
+ private final static int INSIDE_TAG = 0;
+ private final static int OUTSIDE_TAG = 1;
+
private String defaultLanguage = "en";
private String language = null;
private String parameterName = null;
@@ -182,6 +202,15 @@
fslocationValue);
transformer.setParameter("submitted", "true");
transformer.setParameter("yanel.back2context", PathUtil.backToContext(realm, getPath()));
+
+ String crawlStartURL = (String)parameters.get("url");
+ if (crawlStartURL != null && crawlStartURL.length() > 0) {
+ int maxPages = Integer.parseInt((String)parameters.get("crawlmaxpages"));
+ int maxDepth = Integer.parseInt((String)parameters.get("crawldepth"));
+ String realmID = parameters.get("realmid").toString();
+
+ importSite(crawlStartURL, maxPages, maxDepth, realmID);
+ }
}
transformer.transform(new javax.xml.transform.stream.StreamSource(statusXMLFile), new StreamResult(byteArrayOutputStream));
@@ -210,6 +239,227 @@
return defaultView;
}
+    /**
+     * Crawls an external site and imports it into a realm.
+     * <p>
+     * The site is first dumped into a temporary directory below
+     * <code>java.io.tmpdir</code>. After a successful crawl the children of the
+     * root nodes of the realm's repository and RTI repository are deleted and
+     * the dump is imported into the repository. The temporary directory is
+     * removed afterwards, whether or not the import succeeded.
+     *
+     * @param crawlStartURL URL of the page where the crawl starts
+     * @param maxPages value passed to the crawler's <code>setMaxPages</code>
+     * @param maxDepth value passed to the crawler's <code>setMaxDepth</code>
+     * @param realmID ID of the realm which receives the crawled content
+     * @throws Exception if the start URL is malformed, the crawl did not produce
+     *         a dump directory, the realm cannot be found or the import fails
+     */
+    protected void importSite(String crawlStartURL, int maxPages, int maxDepth, String realmID) throws Exception {
+        // Limit the crawl scope to the "directory" of the start page,
+        // e.g. http://foo.bar/docs/start.html -> http://foo.bar/docs
+        String crawlScopeURL = crawlStartURL;
+        URL url = new URL(crawlStartURL);
+        String path = url.getPath();
+        if (path.length() != 0 && !path.endsWith("/") && path.indexOf("/") > -1) {
+            crawlScopeURL = crawlStartURL.substring(0, crawlStartURL.lastIndexOf("/"));
+        }
+
+        String dumpDir = System.getProperty("java.io.tmpdir") + File.separator + "import_" + System.currentTimeMillis();
+        File dumpDirFile = new File(dumpDir);
+        try {
+            // create dump:
+            // TODO: start crawler in thread and show progress
+            DumpingCrawler crawler = new DumpingCrawler(crawlStartURL, crawlScopeURL, dumpDir);
+            try {
+                crawler.setMaxPages(maxPages);
+                crawler.setMaxDepth(maxDepth);
+
+                EventLog eventLog = new EventLog(System.out);
+                crawler.addCrawlListener(eventLog);
+                crawler.addLinkListener(eventLog);
+
+                crawler.run();
+            } finally {
+                // release the crawler even if the crawl was aborted by an exception
+                crawler.close();
+            }
+
+            // Verify the dump BEFORE touching the realm, so that a failed crawl
+            // does not wipe the existing repository content.
+            if (!dumpDirFile.isDirectory()) {
+                throw new IOException("Crawling " + crawlStartURL + " did not produce a dump directory: " + dumpDir);
+            }
+
+            // import dump into realm:
+            Realm realm = getYanel().getRealmConfiguration().getRealm(realmID);
+            if (realm == null) {
+                throw new IllegalArgumentException("No such realm: " + realmID);
+            }
+            deleteRepositoryContent(realm.getRepository());
+            deleteRepositoryContent(realm.getRTIRepository());
+            Node root = realm.getRepository().getRootNode();
+            importContent(dumpDirFile, root);
+
+            fixRootNode(crawlStartURL, root);
+            addResourceConfiguration(realm.getRTIRepository());
+        } finally {
+            // remove temp dump dir; only log a failure here so that it cannot
+            // hide an exception thrown by the crawl or the import
+            try {
+                FileUtils.deleteDirectory(dumpDirFile);
+            } catch (IOException e) {
+                log.warn("Could not delete temporary dump directory: " + dumpDir, e);
+            }
+        }
+    }
+
+    /**
+     * Imports the content of the given directory into the repository as child nodes
+     * of the given node. This will recursively add the complete subtree.
+     * If a child node with the same name already exists, it is reused and its
+     * content is written again.
+     * Files whose guessed mime-type is text/html are passed through
+     * {@link #addIntrospectionLink(InputStream, OutputStream)}; all other files
+     * are copied unchanged.
+     *
+     * @param dir directory whose content is imported
+     * @param node repository node which receives the imported children
+     * @throws IOException if the directory cannot be listed or a file cannot be read or written
+     * @throws RepositoryException if the repository reports an error
+     */
+    protected void importContent(File dir, Node node) throws IOException, RepositoryException {
+        File[] children = dir.listFiles();
+        if (children == null) {
+            // listFiles() returns null if dir is not a directory or an I/O error occurred
+            throw new IOException("Cannot list content of directory: " + dir);
+        }
+        for (int i = 0; i < children.length; i++) {
+            File file = children[i];
+            String name = file.getName();
+            if (file.isDirectory()) {
+                Node childNode = node.hasNode(name) ? node.getNode(name) : node.addNode(name, NodeType.COLLECTION);
+                // recursion:
+                importContent(file, childNode);
+            } else {
+                Node childNode = node.hasNode(name) ? node.getNode(name) : node.addNode(name, NodeType.RESOURCE);
+                String mimeType = guessMimeType(FilenameUtils.getExtension(name));
+                // nested try/finally: both streams are closed even if the copy fails
+                InputStream is = new FileInputStream(file);
+                try {
+                    OutputStream os = childNode.getOutputStream();
+                    try {
+                        if (mimeType.equals("text/html")) {
+                            addIntrospectionLink(is, os);
+                        } else {
+                            byte[] buf = new byte[8192];
+                            int bytesRead;
+                            while ((bytesRead = is.read(buf)) != -1) {
+                                os.write(buf, 0, bytesRead);
+                            }
+                        }
+                        os.flush();
+                    } finally {
+                        os.close();
+                    }
+                } finally {
+                    is.close();
+                }
+                childNode.setMimeType(mimeType);
+            }
+        }
+    }
+
+    /**
+     * Copies an html page from the input to the output stream and adds a yanel
+     * introspection link element directly after the first head start tag.
+     * The tag name is matched case-insensitively, so both &lt;head&gt; and
+     * &lt;HEAD&gt; are recognised, while other tags which merely start with
+     * "head" are not.
+     * Note: this method is stream based and does not consider encoding.
+     * It only works because all relevant characters are ascii.
+     * @param is stream of the source html page
+     * @param os stream of the result html page
+     * @throws IOException if reading or writing fails
+     */
+    protected void addIntrospectionLink(InputStream is, OutputStream os) throws IOException {
+        // The link is pure ascii; encode it explicitly instead of relying on the platform charset.
+        byte[] introspectionLink = "<link rel=\"neutron-introspection\" type=\"application/neutron+xml\" href=\"?yanel.resource.usecase=introspection\"/>".getBytes("US-ASCII");
+        boolean linkAdded = false;
+        int state = OUTSIDE_TAG;
+        StringBuffer tagBuf = null;
+        int b;
+        while ((b = is.read()) != -1) {
+            // every input byte is copied unchanged, the link is only inserted in addition
+            os.write(b);
+            switch (state) {
+            case OUTSIDE_TAG:
+                if (b == '<') {
+                    tagBuf = new StringBuffer();
+                    state = INSIDE_TAG;
+                }
+                break;
+            case INSIDE_TAG:
+                if (b == '>') {
+                    state = OUTSIDE_TAG;
+                    // insert only once, even if a later tag looks like a head tag as well
+                    if (!linkAdded && isHeadStartTag(tagBuf.toString())) {
+                        os.write(introspectionLink);
+                        linkAdded = true;
+                    }
+                } else {
+                    tagBuf.append((char) b);
+                }
+                break;
+            }
+        }
+    }
+
+    /**
+     * Checks whether the given tag content (the text between '&lt;' and '&gt;')
+     * belongs to a head start tag. The tag name ends at the first whitespace
+     * or '/' character, so an end tag ("/head") has an empty name and does not match.
+     * @param tagContent text between the angle brackets
+     * @return true if the tag name is "head", ignoring case
+     */
+    private boolean isHeadStartTag(String tagContent) {
+        int end = 0;
+        while (end < tagContent.length()) {
+            char c = tagContent.charAt(end);
+            if (Character.isWhitespace(c) || c == '/') {
+                break;
+            }
+            end++;
+        }
+        return "head".equalsIgnoreCase(tagContent.substring(0, end));
+    }
+
+    /**
+     * Creates a redirect from the repository root node to the crawl root page.
+     * This is necessary to make the root page of the crawl
+     * accessible at root url of the new realm.
+     * Example:
+     * crawlStartURL: http://foo.bar/start.html
+     * new realm id: foo-realm
+     * -> /foo-realm/ will redirect to /foo-realm/start.html
+     * <p>
+     * If the path of the start URL is empty or ends with a slash, index.html is
+     * used as crawl root page. The redirect is only written if the root node
+     * has a child node with the name of the crawl root page.
+     * This method is best-effort: any exception is logged and not propagated.
+     * @param crawlStartURL URL where the crawl started
+     * @param root root node of the repository
+     */
+    protected void fixRootNode(String crawlStartURL, Node root) {
+        try {
+            URL url = new URL(crawlStartURL);
+            String path = url.getPath();
+            String crawlRoot = null;
+            if (path.length() == 0 || path.endsWith("/")) {
+                crawlRoot = "index.html";
+            } else if (path.indexOf("/") > -1) {
+                crawlRoot = path.substring(path.lastIndexOf("/") + 1);
+            }
+            log.debug("crawlRoot: " + crawlRoot);
+            if (crawlRoot != null && root.hasNode(crawlRoot)) {
+                PrintWriter writer = new PrintWriter(new OutputStreamWriter(root.getOutputStream()));
+                try {
+                    writer.println("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">");
+                    writer.println("<html>");
+                    writer.println("<head>");
+                    // crawlRoot originates from a user supplied URL, so it must not be
+                    // able to break out of the attribute value
+                    writer.println("<meta http-equiv=\"refresh\" content=\"0; url=" + escapeHtmlAttribute(crawlRoot) + "\"/>");
+                    writer.println("</head>");
+                    writer.println("<body/>");
+                    writer.println("</html>");
+                    writer.flush();
+                } finally {
+                    writer.close();
+                }
+            }
+        } catch (Exception e) {
+            log.error(e, e);
+            // ignore
+        }
+    }
+
+    /**
+     * Escapes the characters which are special inside a double-quoted html attribute value.
+     * @param value raw attribute value
+     * @return value with &amp;, &lt;, &gt; and double quotes replaced by character entities
+     */
+    private String escapeHtmlAttribute(String value) {
+        StringBuffer escaped = new StringBuffer(value.length() + 16);
+        for (int i = 0; i < value.length(); i++) {
+            char c = value.charAt(i);
+            switch (c) {
+            case '&':
+                escaped.append("&amp;");
+                break;
+            case '<':
+                escaped.append("&lt;");
+                break;
+            case '>':
+                escaped.append("&gt;");
+                break;
+            case '"':
+                escaped.append("&quot;");
+                break;
+            default:
+                escaped.append(c);
+            }
+        }
+        return escaped.toString();
+    }
+
+    /**
+     * Writes the resource configuration file ".yanel-rc" for the root node of the repository.
+     * The configuration selects the "file" resource type with mime-type text/html.
+     * An already existing ".yanel-rc" node is reused instead of being added again,
+     * following the same pattern as {@link #importContent(File, Node)}.
+     * @param repository repository which receives the resource configuration
+     * @throws RepositoryException if the repository reports an error
+     */
+    protected void addResourceConfiguration(Repository repository) throws RepositoryException {
+        Node root = repository.getRootNode();
+        Node node = root.hasNode(".yanel-rc") ? root.getNode(".yanel-rc") : root.addNode(".yanel-rc", NodeType.RESOURCE);
+        PrintWriter writer = new PrintWriter(new OutputStreamWriter(node.getOutputStream()));
+        try {
+            writer.println("<?xml version=\"1.0\"?>");
+            writer.println("<yanel:resource-config xmlns:yanel=\"http://www.wyona.org/yanel/rti/1.0\">");
+            writer.println("<yanel:rti name=\"file\" namespace=\"http://www.wyona.org/yanel/resource/1.0\"/>");
+            writer.println("<yanel:property name=\"mime-type\" value=\"text/html\"/>");
+            writer.println("</yanel:resource-config>");
+            writer.flush();
+        } finally {
+            // close the node's output stream even if writing was interrupted
+            writer.close();
+        }
+    }
+
+    /**
+     * Empties a repository by deleting every direct child node of its root node.
+     * The root node itself is kept.
+     * @param repository repository whose content is removed
+     * @throws RepositoryException if the repository reports an error
+     */
+    protected void deleteRepositoryContent(Repository repository) throws RepositoryException {
+        Node rootNode = repository.getRootNode();
+        Node[] topLevelNodes = rootNode.getNodes();
+        int index = 0;
+        while (index < topLevelNodes.length) {
+            Node topLevelNode = topLevelNodes[index];
+            topLevelNode.delete();
+            index++;
+        }
+    }
+
+    /**
+     * Returns the mime-type according to the given file extension.
+     * The extension is compared case-insensitively and independently of the
+     * default locale. Default is application/octet-stream, which is also
+     * returned if the extension is null, empty or unknown.
+     * @param extension file extension without the leading dot, e.g. "html"
+     * @return the mime-type, never null
+     */
+    protected String guessMimeType(String extension) {
+        if (extension == null) return "application/octet-stream";
+        // equalsIgnoreCase() does not depend on the default locale, unlike toLowerCase()
+        String ext = extension;
+        if (ext.equalsIgnoreCase("html") || ext.equalsIgnoreCase("htm")) return "text/html";
+        if (ext.equalsIgnoreCase("css")) return "text/css";
+        if (ext.equalsIgnoreCase("txt")) return "text/plain";
+        if (ext.equalsIgnoreCase("js")) return "application/x-javascript";
+        if (ext.equalsIgnoreCase("jpg") || ext.equalsIgnoreCase("jpeg")) return "image/jpeg";
+        if (ext.equalsIgnoreCase("gif")) return "image/gif";
+        if (ext.equalsIgnoreCase("png")) return "image/png";
+        if (ext.equalsIgnoreCase("pdf")) return "application/pdf";
+        if (ext.equalsIgnoreCase("zip")) return "application/zip";
+        //TODO: add more
+        return "application/octet-stream"; // default
+    }
+
/* TODO: add showProgress
private View showProgress(Path path, View defaultView) throws Exception {
More information about the Yanel-commits
mailing list