[Yanel-commits] rev 44020 -
public/yanel/trunk/src/contributions/resources/search/src/java/org/wyona/yanel/impl/resources/search
michi at wyona.com
michi at wyona.com
Wed Aug 5 16:26:33 CEST 2009
Author: michi
Date: 2009-08-05 16:26:33 +0200 (Wed, 05 Aug 2009)
New Revision: 44020
Added:
public/yanel/trunk/src/contributions/resources/search/src/java/org/wyona/yanel/impl/resources/search/TitleContentHandler.java
Modified:
public/yanel/trunk/src/contributions/resources/search/src/java/org/wyona/yanel/impl/resources/search/SearchResource.java
Log:
title content handler introduced
Modified: public/yanel/trunk/src/contributions/resources/search/src/java/org/wyona/yanel/impl/resources/search/SearchResource.java
===================================================================
--- public/yanel/trunk/src/contributions/resources/search/src/java/org/wyona/yanel/impl/resources/search/SearchResource.java 2009-08-05 14:17:35 UTC (rev 44019)
+++ public/yanel/trunk/src/contributions/resources/search/src/java/org/wyona/yanel/impl/resources/search/SearchResource.java 2009-08-05 14:26:33 UTC (rev 44020)
@@ -207,9 +207,11 @@
// NOTE: The tika meta data must not be null, hence we just declare something
org.apache.tika.metadata.Metadata tikaMetaData = new org.apache.tika.metadata.Metadata();
tikaMetaData.set("yarep:path", path);
- parser.parse(in, new org.apache.tika.sax.BodyContentHandler(writer), tikaMetaData);
+ parser.parse(in, new TitleContentHandler(writer), tikaMetaData);
+ //parser.parse(in, new org.apache.tika.sax.BodyContentHandler(writer), tikaMetaData);
//parser.parse(in, new org.apache.tika.sax.WriteOutContentHandler(writer), tikaMetaData);
log.warn("DEBUG: Title: " + writer.toString());
+ return writer.toString();
} catch (Exception e) {
log.error(e, e);
}
Added: public/yanel/trunk/src/contributions/resources/search/src/java/org/wyona/yanel/impl/resources/search/TitleContentHandler.java
===================================================================
--- public/yanel/trunk/src/contributions/resources/search/src/java/org/wyona/yanel/impl/resources/search/TitleContentHandler.java (rev 0)
+++ public/yanel/trunk/src/contributions/resources/search/src/java/org/wyona/yanel/impl/resources/search/TitleContentHandler.java 2009-08-05 14:26:33 UTC (rev 44020)
@@ -0,0 +1,75 @@
+/**
+ * Licensed to Wyona
+ */
+package org.wyona.yanel.impl.resources.search;
+
+import java.io.OutputStream;
+import java.io.Writer;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.sax.xpath.Matcher;
+import org.apache.tika.sax.xpath.MatchingContentHandler;
+import org.apache.tika.sax.xpath.XPathParser;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Content handler decorator that forwards only the events matched inside
+ * the XHTML {@code <title>} element to the underlying handler. Note that
+ * the {@code <title>} element itself is <em>not</em> passed on.
+ */
+public class TitleContentHandler extends ContentHandlerDecorator {
+
+ /**
+ * XPath parser with the "xhtml" prefix bound to XHTMLContentHandler.XHTML.
+ */
+ private static final XPathParser PARSER =
+ new XPathParser("xhtml", XHTMLContentHandler.XHTML);
+
+ /**
+ * Matcher selecting the descendant nodes of html/head/title (XHTML namespace).
+ */
+ private static final Matcher MATCHER =
+ PARSER.parse("/xhtml:html/xhtml:head/xhtml:title/descendant:node()");
+
+ /**
+ * Wraps the given handler in a {@link MatchingContentHandler} driven by
+ * the title matcher, so that only matching events reach the handler.
+ *
+ * @param handler content handler that receives the matched title events
+ */
+ public TitleContentHandler(ContentHandler handler) {
+ super(new MatchingContentHandler(handler, MATCHER));
+ }
+
+ /**
+ * Creates a content handler that sends the matched XHTML title events to
+ * a {@link WriteOutContentHandler} wrapping the given writer.
+ *
+ * @param writer writer that receives the title text
+ */
+ public TitleContentHandler(Writer writer) {
+ this(new WriteOutContentHandler(writer));
+ }
+
+ /**
+ * Creates a content handler that sends the matched XHTML title events to
+ * a {@link WriteOutContentHandler} wrapping the given output stream.
+ * NOTE(review): presumably written in the default encoding; confirm in Tika.
+ * @param stream output stream that receives the title text
+ */
+ public TitleContentHandler(OutputStream stream) {
+ this(new WriteOutContentHandler(stream));
+ }
+
+ /**
+ * Creates a content handler that sends the matched XHTML title events to
+ * a {@link WriteOutContentHandler} created without an explicit target.
+ * NOTE(review): presumably buffered internally and readable via {@link #toString()}; confirm in Tika.
+ */
+ public TitleContentHandler() {
+ this(new WriteOutContentHandler());
+ }
+
+}
More information about the Yanel-commits
mailing list