From e8c6469cb924440df13d276bf6714c484db6b124 Mon Sep 17 00:00:00 2001
From: Thorsten Vitt <thorsten.vitt@uni-wuerzburg.de>
Date: Tue, 3 Sep 2013 18:35:25 +0200
Subject: [PATCH] Refactored towards HTMLWriter. Need tests and cleanup.

---
 .../services/aggregator/html/HTML.java        | 103 ++-----
 .../services/aggregator/html/HTMLWriter.java  | 265 ++++++++++++++++++
 2 files changed, 281 insertions(+), 87 deletions(-)
 create mode 100644 src/main/java/info/textgrid/services/aggregator/html/HTMLWriter.java

diff --git a/src/main/java/info/textgrid/services/aggregator/html/HTML.java b/src/main/java/info/textgrid/services/aggregator/html/HTML.java
index 881204f..ad7f358 100644
--- a/src/main/java/info/textgrid/services/aggregator/html/HTML.java
+++ b/src/main/java/info/textgrid/services/aggregator/html/HTML.java
@@ -1,22 +1,17 @@
 package info.textgrid.services.aggregator.html;
 
-import info.textgrid.namespaces.metadata.core._2010.MetadataContainerType;
 import info.textgrid.namespaces.metadata.core._2010.ObjectType;
 import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.AuthFault;
 import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.IoFault;
 import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.MetadataParseFault;
 import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.ObjectNotFoundFault;
 import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.ProtocolNotImplementedFault;
-import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.TGCrudService;
-import info.textgrid.services.aggregator.GenericExceptionMapper;
 import info.textgrid.services.aggregator.ITextGridRep;
 import info.textgrid.services.aggregator.ITextGridRep.TGOSupplier;
 import info.textgrid.services.aggregator.TextGridRepProvider;
-import info.textgrid.services.aggregator.teicorpus.TEICorpusSerializer;
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.OutputStream;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.net.URL;
@@ -33,27 +28,22 @@
 import javax.ws.rs.QueryParam;
 import javax.ws.rs.WebApplicationException;
 import javax.ws.rs.core.Context;
-import javax.ws.rs.core.Response.Status;
 import javax.ws.rs.core.StreamingOutput;
 import javax.xml.transform.stream.StreamSource;
 
 import net.sf.saxon.s9api.Processor;
-import net.sf.saxon.s9api.QName;
 import net.sf.saxon.s9api.SaxonApiException;
-import net.sf.saxon.s9api.XdmAtomicValue;
 import net.sf.saxon.s9api.XsltCompiler;
 import net.sf.saxon.s9api.XsltExecutable;
-import net.sf.saxon.s9api.XsltTransformer;
 
 import org.apache.cxf.jaxrs.model.wadl.Description;
 
-import com.google.common.base.Stopwatch;
+import com.google.common.base.Optional;
 import com.google.common.cache.Cache;
 import com.google.common.cache.CacheBuilder;
 import com.google.common.cache.CacheLoader;
 import com.google.common.cache.RemovalListener;
 import com.google.common.cache.RemovalNotification;
-import com.google.common.io.FileBackedOutputStream;
 
 @Path("/html")
 @Description("Creates an HTML representation of the given TEI document, or aggregation of TEI documents. This is currently extremely experimental and probably broken.")
@@ -62,20 +52,20 @@ public class HTML {
 
 	private static final String TO_HTML_XSL = "/WEB-INF/stylesheets/db2xhtml.xsl";
 
-	private ITextGridRep repository = TextGridRepProvider.getInstance();
+	ITextGridRep repository = TextGridRepProvider.getInstance();
 
 	final Logger logger = Logger
 			.getLogger("info.textgrid.services.aggregator.html.HTML");
 
 	private XsltExecutable toHtml;
-	private final Processor xsltProcessor;
+	final Processor xsltProcessor;
 
 	@Context
 	private ServletContext servlet;
 
 	private Cache<URI, XsltExecutable> stylesheets;
 
-	private XsltExecutable getToHtml() {
+	XsltExecutable getToHtml() {
 		if (toHtml == null) {
 			try {
 				final URL stylesheet = servlet.getResource(TO_HTML_XSL);
@@ -148,15 +138,16 @@ public XsltExecutable load(final URI url) throws Exception {
 	 * @param uri
 	 *            the URI of the stylesheet to load
 	 * @param sid
-	 *            the session ID to use, or null.
+	 *            the session ID to use, if present
 	 * @param forceLoad
-	 *            TODO
+	 *            do not use a cached version even if present.
 	 * @throws IOException
 	 *             if an error occurs reading the stylesheet.
 	 * @throws SaxonApiException
 	 *             if saxon fails to compile the stylesheet.
 	 */
-	protected XsltExecutable getStylesheet(final URI uri, final String sid,
+	protected XsltExecutable getStylesheet(final URI uri,
+			final Optional<String> sid,
 			final boolean forceLoad) throws SaxonApiException, IOException {
 		XsltExecutable executable = null;
 		
@@ -171,7 +162,8 @@ protected XsltExecutable getStylesheet(final URI uri, final String sid,
 			if (TGUriResolver.isResolveable(uri)) {
 
 				// (2/3) it's a TextGrid object, load it from TG-crud.
-				final TGOSupplier<InputStream> xsltSupplier = repository.read(uri, sid);
+				final TGOSupplier<InputStream> xsltSupplier = repository.read(
+						uri, sid.orNull());
 				executable = compiler.compile(new StreamSource(xsltSupplier
 						.getInput(), uri.toString()));
 
@@ -222,74 +214,11 @@ public StreamingOutput get(
 			SaxonApiException, ExecutionException {
 		logger.fine("HTML called for root object: " + uri);
 
-		final Stopwatch stopwatch = new Stopwatch();
-		stopwatch.start();
-
-		final TGCrudService crud = repository.getCRUDService();
-		final MetadataContainerType container = crud.readMetadata(sid, null,
-				uri.toString());
-		final ObjectType rootObject = container.getObject();
-		final String mimeType = rootObject.getGeneric().getProvided().getFormat();
-		final boolean aggregation = mimeType.contains("aggregation");
-		if (!aggregation && !mimeType.matches("^text/.*xml.*")) {
-			final String errorMsg = MessageFormat.format("The HTML export can only convert aggregations or XML documents to EPUB, however, the document {0} you referred to has the MIME type {1}.", uri, mimeType);
-			throw new WebApplicationException(
-					GenericExceptionMapper.toResponse(
-							Status.UNSUPPORTED_MEDIA_TYPE, errorMsg, ""));
-		}
-		final InputStream tei;
-		if (aggregation) {
-			final TEICorpusSerializer corpusSerializer = new TEICorpusSerializer(
-					rootObject, false, sid);
-			final FileBackedOutputStream corpusBuffer = new FileBackedOutputStream(
-					1024 * 1024, true);
-			corpusSerializer.write(corpusBuffer);
-			corpusBuffer.close();
-			tei = corpusBuffer.getSupplier().getInput();
-			logger.fine("  created intermediate corpus for " + uri);
-		} else {
-			tei = repository.getContent(uri, sid);
-		}
-		logger.info("we have an input document after " + stopwatch.toString());
-
-		final XsltTransformer transformer;
-		if (xsluri == null || "".equals(xsluri)) {
-			transformer = getToHtml().load();
-		} else {
-			transformer = getStylesheet(xsluri, sid, refreshStylesheet).load();
-			if (sid != null) {
-				transformer.setURIResolver(new TGUriResolver(repository, sid));
-			} // otherwise default public URI resolver
-		}
-
-		transformer.setSource(new StreamSource(tei));
-		transformer.setParameter(new QName("graphicsURLPattern"),
-				new XdmAtomicValue(repository.getCRUDRestEndpoint()
-						+ "/@URI@/data"
-						+ ((sid == null || "".equals(sid)) ? ""
-								: ("?sessionId=" + sid))));
-		if (css != null) {
-			transformer.setParameter(new QName("cssFile"), new XdmAtomicValue(css));
-		}
-
-		logger.info("we're ready to transform after " + stopwatch.toString());
-		return new StreamingOutput() {
-
-			@Override
-			public void write(final OutputStream output) throws IOException,
-					WebApplicationException {
-				transformer.setDestination(xsltProcessor.newSerializer(output));
-				try {
-					transformer.transform();
-					logger.info(MessageFormat
-							.format("Finished transformation to HTML for {0} after {1}",
-									uri, stopwatch.toString()));
-				} catch (final SaxonApiException e) {
-					throw new WebApplicationException(e);
-				}
-			}
-		};
-			
-
+		final HTMLWriter writer = new HTMLWriter(this, uri).stylesheet(xsluri)
+				.sid(sid).refresh(refreshStylesheet).css(css);
+		writer.loadSource();
+		writer.loadStylesheet();
+		return writer;
 	}
+
 }
diff --git a/src/main/java/info/textgrid/services/aggregator/html/HTMLWriter.java b/src/main/java/info/textgrid/services/aggregator/html/HTMLWriter.java
new file mode 100644
index 0000000..b52d46a
--- /dev/null
+++ b/src/main/java/info/textgrid/services/aggregator/html/HTMLWriter.java
@@ -0,0 +1,265 @@
+package info.textgrid.services.aggregator.html;
+
+import info.textgrid.namespaces.metadata.core._2010.ObjectType;
+import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.AuthFault;
+import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.IoFault;
+import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.MetadataParseFault;
+import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.ObjectNotFoundFault;
+import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.ProtocolNotImplementedFault;
+import info.textgrid.services.aggregator.GenericExceptionMapper;
+import info.textgrid.services.aggregator.ITextGridRep;
+import info.textgrid.services.aggregator.ITextGridRep.TGOSupplier;
+import info.textgrid.services.aggregator.teicorpus.TEICorpusSerializer;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.text.MessageFormat;
+
+import javax.ws.rs.WebApplicationException;
+import javax.ws.rs.core.Response.Status;
+import javax.ws.rs.core.StreamingOutput;
+import javax.xml.transform.Source;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.TransformerFactoryConfigurationError;
+import javax.xml.transform.stream.StreamSource;
+
+import net.sf.saxon.s9api.QName;
+import net.sf.saxon.s9api.SaxonApiException;
+import net.sf.saxon.s9api.XdmAtomicValue;
+import net.sf.saxon.s9api.XsltExecutable;
+import net.sf.saxon.s9api.XsltTransformer;
+
+import com.google.common.base.Optional;
+import com.google.common.base.Throwables;
+import com.google.common.io.ByteStreams;
+import com.google.common.io.FileBackedOutputStream;
+
+/**
+ * The essential steps:
+ * 
+ * <ol>
+ * <li>Construct the transformation source.
+ * <p>
+ * Depends on different factors: If it's an aggregation, we need to recursively
+ * construct a teiCorpus first. If we need to parse the
+ * {@code <?xml-stylesheet?>} processing instruction, we need to at least
+ * partially parse the document twice, so we download it to a temporary buffer.
+ * Otherwise, the source could be the result stream of the call to TG-crud.
+ * </p>
+ * <p>
+ * Includes some, potentially longish, requests to TextGridRep or other sources
+ * that may cause exceptions.
+ * </p>
+ * </li>
+ * <li>Load and configure the stylesheet.
+ * <p>
+ * The order of precedence is: argument-specified &gt; associated (via PI) &gt;
+ * default stylesheet; the associated stylesheet is only checked on request.
+ * </p>
+ * </li>
+ * <li>Perform the transformation and output the result.
+ * <p>
+ * This step essentially requires the output stream to be present.
+ * </p>
+ * </li>
+ * </ol>
+ * 
+ * @author Thorsten Vitt <thorsten.vitt@uni-wuerzburg.de>
+ * 
+ */
+public class HTMLWriter implements StreamingOutput {
+
+	private final HTML service;
+
+	// Options
+	private final URI rootURI; // object to convert
+	private Optional<URI> stylesheetURI = Optional.absent(); // explicitly requested stylesheet
+	private boolean refreshStylesheet = false; // passed on as forceLoad when fetching stylesheets
+	private boolean tryEmbeddedStylesheet = false; // look for a stylesheet the document refers to
+	private Optional<URI> css = Optional.absent(); // handed to the stylesheet as cssFile parameter
+
+	// detected and extracted
+	private enum SourceType {
+		UNKNOWN, XML, AGGREGATION
+	}
+
+	private SourceType sourceType = SourceType.UNKNOWN;
+
+	private final ITextGridRep repository;
+	// absent by default: sid.orNull() must not fail when sid(String) was never called
+	private Optional<String> sid = Optional.absent();
+
+	private ObjectType metadata; // metadata of the root object, set by loadSource()
+
+	private Optional<URI> associatedStylesheet = Optional.absent(); // found by detectEmbeddedStylesheet()
+
+	private Source source = null; // transformation input, built by loadSource()
+
+	private XsltTransformer transformer; // configured by loadStylesheet()
+
+	// Constructor and configuration
+
+	public HTMLWriter(final HTML service, final URI rootURI) {
+		this.service = service; // supplies stylesheets and the XSLT processor
+		this.rootURI = rootURI; // the object that will be read and converted
+		this.repository = service.repository; // share the service's repository
+	}
+
+	/** Sets the session ID; {@code null} and "" are both stored as absent. */
+	public HTMLWriter sid(final String sid) {
+		final boolean noSession = sid == null || sid.isEmpty();
+		// explicit type argument: the conditional gives absent() no target type
+		this.sid = noSession ? Optional.<String> absent() : Optional
+				.of(sid);
+		return this;
+	}
+
+	public HTMLWriter stylesheet(final URI uri) {
+		this.stylesheetURI = Optional.fromNullable(uri); // null: fall back to associated/default stylesheet
+		return this; // fluent: allows chaining further options
+	}
+
+	public HTMLWriter refresh(final boolean refresh) {
+		this.refreshStylesheet = refresh; // handed to HTML.getStylesheet as its forceLoad argument
+		return this; // fluent: allows chaining further options
+	}
+
+	/** Switches the lookup of a stylesheet referenced by the document on or off. */
+	public HTMLWriter embedded(final boolean embedded) {
+		// switching it on marks the source as XML, switching off keeps the type
+		sourceType = embedded ? SourceType.XML : sourceType;
+		tryEmbeddedStylesheet = embedded;
+		return this;
+	}
+
+	public HTMLWriter css(final URI css) {
+		this.css = Optional.fromNullable(css); // null: no cssFile parameter is passed to the stylesheet
+		return this; // fluent: allows chaining further options
+	}
+
+	/**
+	 * Reads the root object, derives the source type from its metadata format
+	 * and prepares the transformation source; formats that are neither
+	 * aggregation nor XML are rejected with an unsupported-media-type response.
+	 */
+	protected HTMLWriter loadSource() throws ObjectNotFoundFault,
+			MetadataParseFault, IoFault, ProtocolNotImplementedFault,
+			AuthFault, IOException {
+		final TGOSupplier<InputStream> content = repository.read(rootURI, sid.orNull());
+		metadata = content.getMetadata();
+		final String format = metadata.getGeneric().getProvided().getFormat();
+		if (format.contains("aggregation")) {
+			sourceType = SourceType.AGGREGATION;
+		} else if (format.matches("^text/.*xml.*$")) {
+			sourceType = SourceType.XML;
+		} else {
+			final String errorMsg = MessageFormat
+					.format("The HTML export can only convert aggregations or XML documents to HTML, however, the document {0} you referred to has the MIME type {1}.",
+							rootURI, format);
+			throw new WebApplicationException(GenericExceptionMapper.toResponse(
+					Status.UNSUPPORTED_MEDIA_TYPE, errorMsg, ""));
+		}
+		if (sourceType == SourceType.AGGREGATION) {
+			final FileBackedOutputStream corpusBuffer = new FileBackedOutputStream(
+					1024 * 1024, true);
+			new TEICorpusSerializer(metadata, false, sid.orNull())
+					.write(corpusBuffer);
+			corpusBuffer.close();
+			source = new StreamSource(corpusBuffer.getSupplier().getInput());
+		} else if (sourceType == SourceType.XML && tryEmbeddedStylesheet) {
+			// buffered, as the document is read twice (stylesheet lookup, transformation)
+			final FileBackedOutputStream xmlBuffer = new FileBackedOutputStream(
+					1024 * 1024, true);
+			ByteStreams.copy(content, xmlBuffer);
+			xmlBuffer.close(); // done writing; closed like the corpus buffer above
+			detectEmbeddedStylesheet(xmlBuffer.getSupplier().getInput());
+			source = new StreamSource(xmlBuffer.getSupplier().getInput());
+		} else {
+			source = new StreamSource(content.getInput(), rootURI.toString());
+		}
+		return this;
+	}
+
+	/** Best-effort lookup of the stylesheet the document refers to; failures are only logged. */
+	private void detectEmbeddedStylesheet(final InputStream input) {
+		final String failure = "Could not detect the stylesheet associated with " + rootURI;
+		try {
+			final Source stylesheet = TransformerFactory.newInstance()
+					.getAssociatedStylesheet(
+							new StreamSource(input, rootURI.toString()), null,
+							null, null);
+			// null means that no matching stylesheet reference was found
+			if (stylesheet != null && stylesheet.getSystemId() != null) {
+				associatedStylesheet = Optional.of(new URI(stylesheet.getSystemId()));
+			}
+		} catch (final TransformerConfigurationException e) {
+			service.logger.log(java.util.logging.Level.WARNING, failure, e);
+		} catch (final TransformerFactoryConfigurationError e) {
+			service.logger.log(java.util.logging.Level.WARNING, failure, e);
+		} catch (final URISyntaxException e) {
+			service.logger.log(java.util.logging.Level.WARNING, failure, e);
+		}
+	}
+
+	/** Picks the stylesheet: explicit URI, else associated one, else the default. */
+	private XsltExecutable getStylesheet() throws SaxonApiException,
+			IOException {
+		// or() yields the explicit URI if present, otherwise the associated one
+		final Optional<URI> chosen = stylesheetURI.or(associatedStylesheet);
+		if (!chosen.isPresent()) {
+			return service.getToHtml();
+		}
+		return service.getStylesheet(chosen.get(), sid,
+				refreshStylesheet);
+	}
+
+	/** Loads the stylesheet and configures the transformer with source, URI
+	 * resolver and parameters; loads the source first if that is still missing. */
+	protected HTMLWriter loadStylesheet() throws SaxonApiException,
+			IOException, ObjectNotFoundFault, MetadataParseFault, IoFault,
+			ProtocolNotImplementedFault, AuthFault {
+		if (source == null) {
+			loadSource();
+		}
+		final XsltTransformer transformer = getStylesheet().load();
+		transformer.setSource(source); // the transformer needs its input before transform()
+		if (sid.isPresent()) {
+			transformer.setURIResolver(new TGUriResolver(repository, sid));
+		}
+		// unwrap the Optional: concatenating it would embed its toString() in the URL
+		transformer.setParameter(new QName("graphicsURLPattern"),
+				new XdmAtomicValue(repository.getCRUDRestEndpoint() + "/@URI@/data"
+						+ (sid.isPresent() ? "?sessionId=" + sid.get() : "")));
+		if (css.isPresent()) {
+			transformer.setParameter(new QName("cssFile"), new XdmAtomicValue(css.get()));
+		}
+		this.transformer = transformer;
+		return this;
+	}
+
+	/**
+	 * Transforms the source and serializes the result to {@code out}; source
+	 * and stylesheet are loaded here if that has not happened before.
+	 */
+	@Override
+	public void write(final OutputStream out) throws IOException,
+			WebApplicationException {
+		try {
+			if (transformer == null) {
+				loadStylesheet(); // also loads the source if it is missing
+			}
+			transformer.setDestination(service.xsltProcessor.newSerializer(out));
+			transformer.transform();
+		} catch (final Exception e) {
+			Throwables.propagateIfPossible(e, IOException.class,
+					WebApplicationException.class);
+			// remaining checked failures (e.g. SaxonApiException) must not be swallowed
+			throw new WebApplicationException(e);
+		}
+	}
+
+}
-- 
GitLab