[Yanel-commits] rev 44020 - public/yanel/trunk/src/contributions/resources/search/src/java/org/wyona/yanel/impl/resources/search

michi at wyona.com michi at wyona.com
Wed Aug 5 16:26:33 CEST 2009


Author: michi
Date: 2009-08-05 16:26:33 +0200 (Wed, 05 Aug 2009)
New Revision: 44020

Added:
   public/yanel/trunk/src/contributions/resources/search/src/java/org/wyona/yanel/impl/resources/search/TitleContentHandler.java
Modified:
   public/yanel/trunk/src/contributions/resources/search/src/java/org/wyona/yanel/impl/resources/search/SearchResource.java
Log:
title content handler introduced

Modified: public/yanel/trunk/src/contributions/resources/search/src/java/org/wyona/yanel/impl/resources/search/SearchResource.java
===================================================================
--- public/yanel/trunk/src/contributions/resources/search/src/java/org/wyona/yanel/impl/resources/search/SearchResource.java	2009-08-05 14:17:35 UTC (rev 44019)
+++ public/yanel/trunk/src/contributions/resources/search/src/java/org/wyona/yanel/impl/resources/search/SearchResource.java	2009-08-05 14:26:33 UTC (rev 44020)
@@ -207,9 +207,11 @@
                     // NOTE: The tika meta data must not be null, hence we just declare something
                     org.apache.tika.metadata.Metadata tikaMetaData = new org.apache.tika.metadata.Metadata();
                     tikaMetaData.set("yarep:path", path);
-                    parser.parse(in, new org.apache.tika.sax.BodyContentHandler(writer), tikaMetaData);
+                    parser.parse(in, new TitleContentHandler(writer), tikaMetaData);
+                    //parser.parse(in, new org.apache.tika.sax.BodyContentHandler(writer), tikaMetaData);
                     //parser.parse(in, new org.apache.tika.sax.WriteOutContentHandler(writer), tikaMetaData);
                     log.warn("DEBUG: Title: " + writer.toString());
+                    return writer.toString();
                 } catch (Exception e) {
                     log.error(e, e);
                 }

Added: public/yanel/trunk/src/contributions/resources/search/src/java/org/wyona/yanel/impl/resources/search/TitleContentHandler.java
===================================================================
--- public/yanel/trunk/src/contributions/resources/search/src/java/org/wyona/yanel/impl/resources/search/TitleContentHandler.java	                        (rev 0)
+++ public/yanel/trunk/src/contributions/resources/search/src/java/org/wyona/yanel/impl/resources/search/TitleContentHandler.java	2009-08-05 14:26:33 UTC (rev 44020)
@@ -0,0 +1,75 @@
+/**
+ * Licensed to Wyona
+ */
+package org.wyona.yanel.impl.resources.search;
+
+import java.io.OutputStream;
+import java.io.Writer;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.sax.xpath.Matcher;
+import org.apache.tika.sax.xpath.MatchingContentHandler;
+import org.apache.tika.sax.xpath.XPathParser;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Content handler decorator that only passes everything inside
+ * the XHTML &lt;title/&gt; tag to the underlying handler. Note that
+ * the &lt;title/&gt; tag itself is <em>not</em> passed on.
+ */
+public class TitleContentHandler extends ContentHandlerDecorator {
+
+    /**
+     * XPath parser created with the "xhtml" prefix and the XHTML namespace.
+     */
+    private static final XPathParser PARSER =
+        new XPathParser("xhtml", XHTMLContentHandler.XHTML);
+
+    /**
+     * Matcher parsed from the html/head/title descendant-node XPath below.
+     */
+    private static final Matcher MATCHER =
+        PARSER.parse("/xhtml:html/xhtml:head/xhtml:title/descendant:node()");
+
+    /**
+     * Creates a content handler that wraps the given handler in a
+     * {@link MatchingContentHandler} restricted to the title matcher.
+     *
+     * @param handler content handler that receives the matched title events
+     */
+    public TitleContentHandler(ContentHandler handler) {
+        super(new MatchingContentHandler(handler, MATCHER));
+    }
+
+    /**
+     * Creates a content handler that writes XHTML title character events to
+     * the given writer, by wrapping it in a {@link WriteOutContentHandler}.
+     *
+     * @param writer writer that receives the title text
+     */
+    public TitleContentHandler(Writer writer) {
+        this(new WriteOutContentHandler(writer));
+    }
+
+    /**
+     * Creates a content handler that writes XHTML title character events to
+     * the given output stream using the default encoding.
+     *
+     * @param stream output stream that receives the title text
+     */
+    public TitleContentHandler(OutputStream stream) {
+        this(new WriteOutContentHandler(stream));
+    }
+
+    /**
+     * Creates a content handler that writes XHTML title character events to
+     * an internal string buffer. The contents of the buffer can be retrieved
+     * using the {@link #toString()} method.
+     */
+    public TitleContentHandler() {
+        this(new WriteOutContentHandler());
+    }
+
+}



More information about the Yanel-commits mailing list