From e8c6469cb924440df13d276bf6714c484db6b124 Mon Sep 17 00:00:00 2001 From: Thorsten Vitt <thorsten.vitt@uni-wuerzburg.de> Date: Tue, 3 Sep 2013 18:35:25 +0200 Subject: [PATCH] Refactored towards HTMLWriter. Need tests and cleanup. --- .../services/aggregator/html/HTML.java | 103 ++----- .../services/aggregator/html/HTMLWriter.java | 265 ++++++++++++++++++ 2 files changed, 281 insertions(+), 87 deletions(-) create mode 100644 src/main/java/info/textgrid/services/aggregator/html/HTMLWriter.java diff --git a/src/main/java/info/textgrid/services/aggregator/html/HTML.java b/src/main/java/info/textgrid/services/aggregator/html/HTML.java index 881204f..ad7f358 100644 --- a/src/main/java/info/textgrid/services/aggregator/html/HTML.java +++ b/src/main/java/info/textgrid/services/aggregator/html/HTML.java @@ -1,22 +1,17 @@ package info.textgrid.services.aggregator.html; -import info.textgrid.namespaces.metadata.core._2010.MetadataContainerType; import info.textgrid.namespaces.metadata.core._2010.ObjectType; import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.AuthFault; import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.IoFault; import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.MetadataParseFault; import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.ObjectNotFoundFault; import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.ProtocolNotImplementedFault; -import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.TGCrudService; -import info.textgrid.services.aggregator.GenericExceptionMapper; import info.textgrid.services.aggregator.ITextGridRep; import info.textgrid.services.aggregator.ITextGridRep.TGOSupplier; import info.textgrid.services.aggregator.TextGridRepProvider; -import info.textgrid.services.aggregator.teicorpus.TEICorpusSerializer; import java.io.IOException; import java.io.InputStream; -import java.io.OutputStream; import java.net.MalformedURLException; import java.net.URI; import java.net.URL; @@ -33,27 +28,22 @@ import javax.ws.rs.QueryParam; import javax.ws.rs.WebApplicationException; import javax.ws.rs.core.Context; -import javax.ws.rs.core.Response.Status; import javax.ws.rs.core.StreamingOutput; import javax.xml.transform.stream.StreamSource; import net.sf.saxon.s9api.Processor; -import net.sf.saxon.s9api.QName; import net.sf.saxon.s9api.SaxonApiException; -import net.sf.saxon.s9api.XdmAtomicValue; import net.sf.saxon.s9api.XsltCompiler; import net.sf.saxon.s9api.XsltExecutable; -import net.sf.saxon.s9api.XsltTransformer; import org.apache.cxf.jaxrs.model.wadl.Description; -import com.google.common.base.Stopwatch; +import com.google.common.base.Optional; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import com.google.common.cache.CacheLoader; import com.google.common.cache.RemovalListener; import com.google.common.cache.RemovalNotification; -import com.google.common.io.FileBackedOutputStream; @Path("/html") @Description("Creates an HTML representation of the given TEI document, or aggregation of TEI documents. This is currently extremely experimental and probably broken.") @@ -62,20 +52,20 @@ public class HTML { private static final String TO_HTML_XSL = "/WEB-INF/stylesheets/db2xhtml.xsl"; - private ITextGridRep repository = TextGridRepProvider.getInstance(); + ITextGridRep repository = TextGridRepProvider.getInstance(); final Logger logger = Logger .getLogger("info.textgrid.services.aggregator.html.HTML"); private XsltExecutable toHtml; - private final Processor xsltProcessor; + final Processor xsltProcessor; @Context private ServletContext servlet; private Cache<URI, XsltExecutable> stylesheets; - private XsltExecutable getToHtml() { + XsltExecutable getToHtml() { if (toHtml == null) { try { final URL stylesheet = servlet.getResource(TO_HTML_XSL); @@ -148,15 +138,16 @@ public XsltExecutable load(final URI url) throws Exception { * @param uri * the URI of the stylesheet to load * @param sid - * the session ID to use, or null. + * the session ID to use, if present * @param forceLoad - * TODO + * do not use a cached version even if present. * @throws IOException * if an error occurs reading the stylesheet. * @throws SaxonApiException * if saxon fails to compile the stylesheet. */ - protected XsltExecutable getStylesheet(final URI uri, final String sid, + protected XsltExecutable getStylesheet(final URI uri, + final Optional<String> sid, final boolean forceLoad) throws SaxonApiException, IOException { XsltExecutable executable = null; @@ -171,7 +162,8 @@ protected XsltExecutable getStylesheet(final URI uri, final String sid, if (TGUriResolver.isResolveable(uri)) { // (2/3) it's a TextGrid object, load it from TG-crud. - final TGOSupplier<InputStream> xsltSupplier = repository.read(uri, sid); + final TGOSupplier<InputStream> xsltSupplier = repository.read( + uri, sid.orNull()); executable = compiler.compile(new StreamSource(xsltSupplier .getInput(), uri.toString())); @@ -222,74 +214,11 @@ public StreamingOutput get( SaxonApiException, ExecutionException { logger.fine("HTML called for root object: " + uri); - final Stopwatch stopwatch = new Stopwatch(); - stopwatch.start(); - - final TGCrudService crud = repository.getCRUDService(); - final MetadataContainerType container = crud.readMetadata(sid, null, - uri.toString()); - final ObjectType rootObject = container.getObject(); - final String mimeType = rootObject.getGeneric().getProvided().getFormat(); - final boolean aggregation = mimeType.contains("aggregation"); - if (!aggregation && !mimeType.matches("^text/.*xml.*")) { - final String errorMsg = MessageFormat.format("The HTML export can only convert aggregations or XML documents to EPUB, however, the document {0} you referred to has the MIME type {1}.", uri, mimeType); - throw new WebApplicationException( - GenericExceptionMapper.toResponse( - Status.UNSUPPORTED_MEDIA_TYPE, errorMsg, "")); - } - final InputStream tei; - if (aggregation) { - final TEICorpusSerializer corpusSerializer = new TEICorpusSerializer( - rootObject, false, sid); - final FileBackedOutputStream corpusBuffer = new FileBackedOutputStream( - 1024 * 1024, true); - corpusSerializer.write(corpusBuffer); - corpusBuffer.close(); - tei = corpusBuffer.getSupplier().getInput(); - logger.fine(" created intermediate corpus for " + uri); - } else { - tei = repository.getContent(uri, sid); - } - logger.info("we have an input document after " + stopwatch.toString()); - - final XsltTransformer transformer; - if (xsluri == null || "".equals(xsluri)) { - transformer = getToHtml().load(); - } else { - transformer = getStylesheet(xsluri, sid, refreshStylesheet).load(); - if (sid != null) { - transformer.setURIResolver(new TGUriResolver(repository, sid)); - } // otherwise default public URI resolver - } - - transformer.setSource(new StreamSource(tei)); - transformer.setParameter(new QName("graphicsURLPattern"), - new XdmAtomicValue(repository.getCRUDRestEndpoint() - + "/@URI@/data" - + ((sid == null || "".equals(sid)) ? "" - : ("?sessionId=" + sid)))); - if (css != null) { - transformer.setParameter(new QName("cssFile"), new XdmAtomicValue(css)); - } - - logger.info("we're ready to transform after " + stopwatch.toString()); - return new StreamingOutput() { - - @Override - public void write(final OutputStream output) throws IOException, - WebApplicationException { - transformer.setDestination(xsltProcessor.newSerializer(output)); - try { - transformer.transform(); - logger.info(MessageFormat - .format("Finished transformation to HTML for {0} after {1}", - uri, stopwatch.toString())); - } catch (final SaxonApiException e) { - throw new WebApplicationException(e); - } - } - }; - - + final HTMLWriter writer = new HTMLWriter(this, uri).stylesheet(xsluri) + .sid(sid).refresh(refreshStylesheet).css(css); + writer.loadSource(); + writer.loadStylesheet(); + return writer; } + } diff --git a/src/main/java/info/textgrid/services/aggregator/html/HTMLWriter.java b/src/main/java/info/textgrid/services/aggregator/html/HTMLWriter.java new file mode 100644 index 0000000..b52d46a --- /dev/null +++ b/src/main/java/info/textgrid/services/aggregator/html/HTMLWriter.java @@ -0,0 +1,265 @@ +package info.textgrid.services.aggregator.html; + +import info.textgrid.namespaces.metadata.core._2010.ObjectType; +import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.AuthFault; +import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.IoFault; +import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.MetadataParseFault; +import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.ObjectNotFoundFault; +import info.textgrid.namespaces.middleware.tgcrud.services.tgcrudservice.ProtocolNotImplementedFault; +import info.textgrid.services.aggregator.GenericExceptionMapper; +import info.textgrid.services.aggregator.ITextGridRep; +import info.textgrid.services.aggregator.ITextGridRep.TGOSupplier; +import info.textgrid.services.aggregator.teicorpus.TEICorpusSerializer; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.URI; +import java.net.URISyntaxException; +import java.text.MessageFormat; + +import javax.ws.rs.WebApplicationException; +import javax.ws.rs.core.Response.Status; +import javax.ws.rs.core.StreamingOutput; +import javax.xml.transform.Source; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.TransformerFactoryConfigurationError; +import javax.xml.transform.stream.StreamSource; + +import net.sf.saxon.s9api.QName; +import net.sf.saxon.s9api.SaxonApiException; +import net.sf.saxon.s9api.XdmAtomicValue; +import net.sf.saxon.s9api.XsltExecutable; +import net.sf.saxon.s9api.XsltTransformer; + +import com.google.common.base.Optional; +import com.google.common.base.Throwables; +import com.google.common.io.ByteStreams; +import com.google.common.io.FileBackedOutputStream; + +/** + * The essential steps: + * + * <ol> + * <li>Construct the transformation source. + * <p> + * Depends on different factors: If it's an aggregation, we need to recursively + * construct a teiCorpus first. If we need to parse the + * {@code <?xsl-stylesheet?>} processing instruction, we need to at least + * partially parse the document twice, so we download it to a temporary buffer. + * Otherwise, the source could be the result stream of the call to TG-crud. + * </p> + * <p> + * Includes some, potentially longish, requests to TextGridRep or other sources + * that may cause exceptions. + * </p> + * </li> + * <li>Load and configure the stylesheet. + * <p> + * Typically, the order is argument-specified > associated (via pi) > default + * stylesheet, with the associated stylesheet only checked on request. + * </p> + * </li> + * <li>Perform the transformation and output the result. + * <p> + * This step essentially requires the output stream to be present. + * </p> + * </li> + * </ol> + * + * @author Thorsten Vitt <thorsten.vitt@uni-wuerzburg.de> + * + */ +public class HTMLWriter implements StreamingOutput { + + private final HTML service; + + // Options + private final URI rootURI; + private Optional<URI> stylesheetURI = Optional.absent(); + private boolean refreshStylesheet = false; + private boolean tryEmbeddedStylesheet = false; + private Optional<URI> css = Optional.absent(); + + // detected and extracted + private enum SourceType { + UNKNOWN, XML, AGGREGATION + } + + private SourceType sourceType = SourceType.UNKNOWN; + + private final ITextGridRep repository; + + private Optional<String> sid; + + private ObjectType metadata; + + private Optional<URI> associatedStylesheet = Optional.absent(); + + private Source source = null; + + private XsltTransformer transformer; + + // Constructor and configuration + + public HTMLWriter(final HTML service, final URI rootURI) { + this.service = service; + this.rootURI = rootURI; + this.repository = service.repository; + } + + public HTMLWriter sid(final String sid) { + if (sid == null || sid.isEmpty()) { + this.sid = Optional.absent(); + } else { + this.sid = Optional.of(sid); + } + return this; + } + + public HTMLWriter stylesheet(final URI uri) { + this.stylesheetURI = Optional.fromNullable(uri); + return this; + } + + public HTMLWriter refresh(final boolean refresh) { + this.refreshStylesheet = refresh; + return this; + } + + public HTMLWriter embedded(final boolean embedded) { + this.tryEmbeddedStylesheet = embedded; + if (embedded) { + this.sourceType = SourceType.XML; + } + return this; + } + + public HTMLWriter css(final URI css) { + this.css = Optional.fromNullable(css); + return this; + } + + protected HTMLWriter loadSource() throws ObjectNotFoundFault, + MetadataParseFault, + IoFault, ProtocolNotImplementedFault, AuthFault, IOException { + + TGOSupplier<InputStream> content = null; + content = service.repository.read(rootURI, sid.orNull()); + metadata = content.getMetadata(); + final String format = metadata.getGeneric().getProvided().getFormat(); + if (format.contains("aggregation")) { + sourceType = SourceType.AGGREGATION; + } else if (format.matches("^text/.*xml.*$")) { + sourceType = SourceType.XML; + } else { + final String errorMsg = MessageFormat + .format("The HTML export can only convert aggregations or XML documents to EPUB, however, the document {0} you referred to has the MIME type {1}.", + rootURI, format); + throw new WebApplicationException( + GenericExceptionMapper.toResponse( + Status.UNSUPPORTED_MEDIA_TYPE, errorMsg, "")); + } + + if (sourceType == SourceType.AGGREGATION) { + final TEICorpusSerializer corpusSerializer = new TEICorpusSerializer( + metadata, false, sid.orNull()); + final FileBackedOutputStream corpusBuffer = new FileBackedOutputStream( + 1024 * 1024, true); + corpusSerializer.write(corpusBuffer); + corpusBuffer.close(); + this.source = new StreamSource(corpusBuffer.getSupplier() + .getInput()); + } else if (sourceType == SourceType.XML && tryEmbeddedStylesheet) { + final FileBackedOutputStream xmlBuffer = new FileBackedOutputStream( + 1024 * 1024, true); + ByteStreams.copy(content, xmlBuffer); + detectEmbeddedStylesheet(xmlBuffer.getSupplier().getInput()); + this.source = new StreamSource(xmlBuffer.getSupplier().getInput()); + } else { + this.source = new StreamSource(content.getInput(), + rootURI.toString()); + } + return this; + } + + private void detectEmbeddedStylesheet(final InputStream input) { + try { + final Source associatedStylesheet = TransformerFactory + .newInstance().getAssociatedStylesheet( + new StreamSource(input, rootURI.toString()), null, + null, null); + this.associatedStylesheet = Optional.of(new URI( + associatedStylesheet.getSystemId())); + + } catch (final TransformerConfigurationException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (final TransformerFactoryConfigurationError e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (final URISyntaxException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + private XsltExecutable getStylesheet() throws SaxonApiException, + IOException { + if (stylesheetURI.isPresent()) + return service.getStylesheet(stylesheetURI.get(), sid, + refreshStylesheet); + else if (associatedStylesheet.isPresent()) + return service.getStylesheet(associatedStylesheet.get(), sid, + refreshStylesheet); + else + return service.getToHtml(); + } + + protected HTMLWriter loadStylesheet() throws SaxonApiException, + IOException, ObjectNotFoundFault, MetadataParseFault, IoFault, + ProtocolNotImplementedFault, AuthFault { + if (source == null) { + loadSource(); + } + + final XsltTransformer transformer = getStylesheet().load(); + if (sid.isPresent()) { + transformer.setURIResolver(new TGUriResolver(repository, sid)); + } + transformer.setParameter(new QName("graphicsURLPattern"), + new XdmAtomicValue(repository.getCRUDRestEndpoint() + + "/@URI@/data" + + ((sid == null || "".equals(sid)) ? "" + : ("?sessionId=" + sid)))); + if (css.isPresent()) { + transformer.setParameter(new QName("cssFile"), new XdmAtomicValue( + css.get())); + } + this.transformer = transformer; + return this; + } + + @Override + public void write(final OutputStream out) throws IOException, + WebApplicationException { + try { + if (source == null) { + loadSource(); + } + if (transformer == null) { + loadStylesheet(); + } + transformer + .setDestination(service.xsltProcessor.newSerializer(out)); + transformer.transform(); + + } catch (final Exception e) { + Throwables.propagateIfPossible(e, IOException.class, + WebApplicationException.class); + } + + } + +} -- GitLab