package org.nuxeo.ecm.platform.semanticentities.extraction;

import com.hp.hpl.jena.rdf.model.Literal;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Property;
import com.hp.hpl.jena.rdf.model.ResIterator;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.Statement;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.entity.ByteArrayEntity;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.params.BasicHttpParams;
import org.nuxeo.common.utils.StringUtils;
import org.nuxeo.ecm.automation.core.annotations.Context;
import org.nuxeo.ecm.automation.core.annotations.Operation;
import org.nuxeo.ecm.automation.core.annotations.OperationMethod;
import org.nuxeo.ecm.automation.core.annotations.Param;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.ClientException;
import org.nuxeo.ecm.core.api.CoreSession;
import org.nuxeo.ecm.core.api.DocumentModel;
import org.nuxeo.ecm.core.api.DocumentModelList;
import org.nuxeo.ecm.core.api.DocumentRef;
import org.nuxeo.ecm.core.api.DocumentRefList;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
import org.nuxeo.ecm.core.api.impl.DocumentModelListImpl;
import org.nuxeo.ecm.core.api.impl.blob.StreamingBlob;
import org.nuxeo.ecm.core.api.model.PropertyException;
import org.nuxeo.ecm.core.api.pathsegment.PathSegmentService;
import org.nuxeo.ecm.core.convert.api.ConversionService;
import org.nuxeo.ecm.core.schema.SchemaManager;
import org.nuxeo.ecm.core.utils.BlobsExtractor;
import org.nuxeo.ecm.platform.semanticentities.EntitySuggestion;
import org.nuxeo.ecm.platform.semanticentities.LocalEntityService;
import org.nuxeo.ecm.platform.semanticentities.adapter.OccurrenceGroup;
import org.nuxeo.ecm.platform.semanticentities.adapter.OccurrenceInfo;
import org.nuxeo.runtime.api.Framework;

/**
 * Automation operation that extracts the textual content of a document, sends
 * it to a remote semantic engine over HTTP and links the entity occurrences
 * found in the RDF response to local entity documents.
 *
 * <p>
 * Documents that are themselves entities or occurrences, and documents whose
 * {@code dc:language} is set to something other than English, are returned
 * untouched.
 */
@Operation(id = OccurrenceExtractionOperation.ID, category = "Document", label = "Extract occurrences", description = "Extract the text and launch an use a semantic engine to extract and link occurrences of semantic entities. Returns back the analyzed document.")
public class OccurrenceExtractionOperation {
    public static final String ID = "Document.ExtractSemanticEntitiesOccurrences";
    private static final String ANY2TEXT = "any2text";
    protected static final String DEFAULT_ENGINE_URL = "https://stanbol.demo.nuxeo.com/engines";
    protected static final String ENGINE_URL_PROPERTY = "org.nuxeo.ecm.platform.semanticentities.stanbolUrl";
    protected static final String DEFAULT_SPARQL_QUERY = "SELECT ?label ?type ?context ";
    protected static final String DEFAULT_SOURCE_NAME = "dbpedia";
    protected static final String DEFAULT_ENGINE_OUTPUT_FORMAT = "application/rdf+xml";

    /** Separator inserted between the text fragments extracted from a document. */
    private static final String PARAGRAPH_SEPARATOR = "\n\n";

    /** Maximum number of local entity suggestions requested per occurrence group. */
    private static final int MAX_SUGGESTIONS = 3;

    /** Contexts longer than this are replaced by the bare mention. */
    private static final int MAX_CONTEXT_LENGTH = 500;

    private static final String RDF_TYPE_URI = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
    private static final String DC_TYPE_URI = "http://purl.org/dc/terms/type";
    private static final String DC_RELATION_URI = "http://purl.org/dc/terms/relation";
    private static final String TEXT_ANNOTATION_URI = "http://fise.iks-project.eu/ontology/TextAnnotation";
    private static final String SELECTED_TEXT_URI = "http://fise.iks-project.eu/ontology/selected-text";
    private static final String SELECTION_CONTEXT_URI = "http://fise.iks-project.eu/ontology/selection-context";

    private static final Log log = LogFactory.getLog(OccurrenceExtractionOperation.class);

    /** Maps the type URIs found in the engine output to local document type names. */
    protected static final Map<String, String> localTypes = new HashMap<String, String>();

    static {
        localTypes.put("http://dbpedia.org/ontology/Place", "Place");
        localTypes.put("http://dbpedia.org/ontology/Person", "Person");
        localTypes.put("http://dbpedia.org/ontology/Organisation", "Organization");
    }

    protected ConversionService conversionService;
    protected HttpClient httpClient;

    @Context
    protected CoreSession session;

    /** Engine endpoint; when {@code null} the runtime property / default URL is used. */
    @Param(name = "engineURL", required = true, values = {DEFAULT_ENGINE_URL})
    protected String engineURL = null;

    /** Not read by any method of this class. */
    @Param(name = "sparqlQuery", required = true, values = {DEFAULT_SPARQL_QUERY})
    protected String sparqlQuery = DEFAULT_SPARQL_QUERY;

    /** Not read by any method of this class. */
    @Param(name = "sourceName", required = true, values = {DEFAULT_SOURCE_NAME})
    protected String sourceName = DEFAULT_SOURCE_NAME;

    /** Sent as the HTTP {@code Accept} header of the engine request. */
    @Param(name = "engineOutputFormat", required = true, values = {DEFAULT_ENGINE_OUTPUT_FORMAT})
    protected String outputFormat = DEFAULT_ENGINE_OUTPUT_FORMAT;

    /** When true, a new entity document is created for mentions without any local match. */
    @Param(name = "linkToUnrecognizedEntities", required = true, values = {"true"})
    protected boolean linkToUnrecognizedEntities = true;

    /** When true, mentions with several candidate entities are linked to the first one. */
    @Param(name = "linkToAmbiguousEntities", required = true, values = {"false"})
    protected boolean linkToAmbiguousEntities = false;

    /** When true, single-word "Person" mentions are linked as well. */
    @Param(name = "linkShortPersonNames", required = true, values = {"false"})
    protected boolean linkShortPersonNames = false;

    /**
     * Creates the operation, looks up the conversion service and builds the
     * HTTP client.
     *
     * @throws Exception if the conversion service lookup fails
     */
    public OccurrenceExtractionOperation() throws Exception {
        this.conversionService = (ConversionService) Framework.getService(ConversionService.class);
        // NOTE(review): overridable method called from the constructor; kept
        // because subclasses may rely on it.
        initHttpClient();
    }

    /**
     * Creates the operation bound to an explicit session.
     *
     * @param coreSession the session used to read and create documents
     * @throws Exception if the conversion service lookup fails
     */
    public OccurrenceExtractionOperation(CoreSession coreSession) throws Exception {
        this();
        this.session = coreSession;
    }

    /**
     * Builds an HTTP client backed by a thread-safe connection manager that
     * handles both the {@code http} (port 80) and {@code https} (port 443)
     * schemes.
     */
    protected void initHttpClient() {
        SchemeRegistry schemes = new SchemeRegistry();
        schemes.register(new Scheme("http", PlainSocketFactory.getSocketFactory(), 80));
        schemes.register(new Scheme("https", SSLSocketFactory.getSocketFactory(), 443));
        BasicHttpParams params = new BasicHttpParams();
        this.httpClient = new DefaultHttpClient(new ThreadSafeClientConnManager(params, schemes), params);
    }

    /**
     * Analyzes the document designated by the given reference.
     *
     * @param documentRef reference of the document to analyze
     * @return the reference of the analyzed document
     * @throws Exception if the document cannot be fetched or analyzed
     */
    @OperationMethod
    public DocumentRef run(DocumentRef documentRef) throws Exception {
        return run(this.session.getDocument(documentRef)).getRef();
    }

    /**
     * Extracts the text of the document, calls the semantic engine and links
     * the occurrences it reports to local entities.
     *
     * @param documentModel the document to analyze
     * @return the same document instance
     * @throws Exception on service lookup, HTTP, parsing or repository errors
     */
    @OperationMethod
    public DocumentModel run(DocumentModel documentModel) throws Exception {
        if (isEntityOrOccurrence(documentModel) || !isSupportedLanguage(documentModel)) {
            return documentModel;
        }
        String engineOutput = callSemanticEngine(extractText(documentModel), this.outputFormat);
        // The response is parsed with the model's default syntax whatever the
        // requested output format is.
        Model model = ModelFactory.createDefaultModel().read(new StringReader(engineOutput), (String) null);
        List<OccurrenceGroup> groups = findStanbolEntityOccurrences(model);
        if (groups.isEmpty()) {
            return documentModel;
        }
        LocalEntityService entityService = (LocalEntityService) Framework.getService(LocalEntityService.class);
        DocumentModel entityContainer = entityService.getEntityContainer(this.session);
        for (OccurrenceGroup group : groups) {
            if (isLinkable(group)) {
                linkOccurrenceGroup(entityService, entityContainer, documentModel, group);
            }
        }
        return documentModel;
    }

    /** Tells whether the document type extends "Entity" or "Occurrence". */
    private boolean isEntityOrOccurrence(DocumentModel documentModel) throws Exception {
        SchemaManager schemaManager = (SchemaManager) Framework.getService(SchemaManager.class);
        String type = documentModel.getType();
        return schemaManager.getDocumentTypeNamesExtending("Entity").contains(type)
                || schemaManager.getDocumentTypeNamesExtending("Occurrence").contains(type);
    }

    /** Tells whether dc:language is unset, empty, "en" or "english" (case-insensitive). */
    private boolean isSupportedLanguage(DocumentModel documentModel) throws Exception {
        String language = (String) documentModel.getProperty("dc:language").getValue(String.class);
        return language == null || language.isEmpty() || "en".equalsIgnoreCase(language)
                || "english".equalsIgnoreCase(language);
    }

    /** Single-word "Person" mentions are skipped unless linkShortPersonNames is set. */
    private boolean isLinkable(OccurrenceGroup group) {
        if (this.linkShortPersonNames || !"Person".equals(group.type)) {
            return true;
        }
        // Split on any whitespace so that tabs / line breaks also separate words.
        return group.name.trim().split("\\s+").length > 1;
    }

    /**
     * Links one group of occurrences to a local entity: the single (or, when
     * ambiguity is allowed, the first) suggestion, or a freshly created entity
     * when nothing matches and unrecognized entities are allowed.
     */
    private void linkOccurrenceGroup(LocalEntityService entityService, DocumentModel entityContainer,
            DocumentModel documentModel, OccurrenceGroup group) throws Exception {
        List<EntitySuggestion> suggestions = entityService.suggestEntity(this.session, group.name, group.type,
                MAX_SUGGESTIONS);
        if (suggestions.isEmpty()) {
            // Nothing to link to unless a new entity may be created; never
            // index into the empty suggestion list.
            if (this.linkToUnrecognizedEntities) {
                DocumentModel entity = createEntity(entityContainer, group);
                entityService.addOccurrences(this.session, documentModel.getRef(), entity.getRef(),
                        group.occurrences);
            }
            return;
        }
        if (suggestions.size() == 1 || this.linkToAmbiguousEntities) {
            entityService.addOccurrences(this.session, documentModel.getRef(), suggestions.get(0),
                    group.occurrences);
        }
    }

    /** Creates and saves a new entity document named after the group in the entity container. */
    private DocumentModel createEntity(DocumentModel entityContainer, OccurrenceGroup group) throws Exception {
        PathSegmentService pathSegmentService = (PathSegmentService) Framework.getService(PathSegmentService.class);
        DocumentModel draft = this.session.createDocumentModel(group.type);
        draft.setPropertyValue("dc:title", group.name);
        draft.setPathInfo(entityContainer.getPathAsString(), pathSegmentService.generatePathSegment(draft));
        DocumentModel entity = this.session.createDocument(draft);
        this.session.save();
        return entity;
    }

    /**
     * Collects the occurrence groups described by the text annotations of the
     * given RDF model.
     *
     * <p>
     * A text annotation that has no {@code dc:relation} of its own and whose
     * {@code dc:type} maps to a known local type starts a group; every
     * resource pointing at it through {@code dc:relation} contributes an
     * additional occurrence to that group.
     *
     * @param model the RDF model returned by the engine
     * @return the groups found, possibly empty, never {@code null}
     */
    public List<OccurrenceGroup> findStanbolEntityOccurrences(Model model) {
        Property rdfType = model.getProperty(RDF_TYPE_URI);
        Property dcType = model.getProperty(DC_TYPE_URI);
        Property dcRelation = model.getProperty(DC_RELATION_URI);
        Resource textAnnotation = model.getResource(TEXT_ANNOTATION_URI);

        List<OccurrenceGroup> groups = new ArrayList<OccurrenceGroup>();
        ResIterator annotations = model.listSubjectsWithProperty(rdfType, textAnnotation);
        while (annotations.hasNext()) {
            Resource annotation = annotations.nextResource();
            if (model.listObjectsOfProperty(annotation, dcRelation).hasNext()) {
                // Related to another annotation: handled as part of that one's group.
                continue;
            }
            Statement typeStatement = annotation.getProperty(dcType);
            if (typeStatement == null || !typeStatement.getObject().isURIResource()) {
                continue;
            }
            String localType = localTypes.get(typeStatement.getObject().as(Resource.class).getURI());
            if (localType == null) {
                continue;
            }
            OccurrenceInfo head = getOccurrenceInfo(model, annotation);
            if (head == null) {
                continue;
            }
            OccurrenceGroup group = new OccurrenceGroup(head.mention, localType);
            group.occurrences.add(head);
            ResIterator related = model.listSubjectsWithProperty(dcRelation, annotation);
            while (related.hasNext()) {
                OccurrenceInfo occurrence = getOccurrenceInfo(model, related.nextResource());
                if (occurrence != null) {
                    group.occurrences.add(occurrence);
                }
            }
            groups.add(group);
        }
        return groups;
    }

    /**
     * Builds the occurrence (mention + surrounding context) described by an
     * annotation resource.
     *
     * @param model the model the resource belongs to
     * @param resource the annotation resource
     * @return the occurrence, or {@code null} when the resource has no literal
     *         selected text; the context falls back to the mention itself when
     *         it is missing, does not contain the mention, or is longer than
     *         500 characters
     */
    protected OccurrenceInfo getOccurrenceInfo(Model model, Resource resource) {
        Statement mentionStatement = resource.getProperty(model.getProperty(SELECTED_TEXT_URI));
        if (mentionStatement == null || !mentionStatement.getObject().isLiteral()) {
            return null;
        }
        String mention = mentionStatement.getObject().as(Literal.class).getString().trim();

        Statement contextStatement = resource.getProperty(model.getProperty(SELECTION_CONTEXT_URI));
        if (contextStatement == null || !contextStatement.getObject().isLiteral()) {
            return new OccurrenceInfo(mention, mention);
        }
        String context = contextStatement.getObject().as(Literal.class).getString().trim();
        if (!context.contains(mention) || context.length() > MAX_CONTEXT_LENGTH) {
            context = mention;
        }
        return new OccurrenceInfo(mention, context);
    }

    /**
     * Analyzes every document of the list.
     *
     * @param documentModelList the documents to analyze
     * @return a new list holding the analyzed documents, in the same order
     * @throws Exception if any document fails to be analyzed
     */
    @OperationMethod
    public DocumentModelList run(DocumentModelList documentModelList) throws Exception {
        // Size the result on the actual element count: totalSize() is a long
        // and is not guaranteed to match (or fit) the number of elements held.
        DocumentModelListImpl analyzed = new DocumentModelListImpl(documentModelList.size());
        for (DocumentModel document : documentModelList) {
            analyzed.add(run(document));
        }
        return analyzed;
    }

    /**
     * Analyzes every referenced document.
     *
     * @param documentRefList references of the documents to analyze
     * @return a new list holding the analyzed documents, in the same order
     * @throws Exception if any document fails to be fetched or analyzed
     */
    @OperationMethod
    public DocumentModelList run(DocumentRefList documentRefList) throws Exception {
        // See run(DocumentModelList) for why size() is used instead of totalSize().
        DocumentModelListImpl analyzed = new DocumentModelListImpl(documentRefList.size());
        for (DocumentRef ref : documentRefList) {
            analyzed.add(this.session.getDocument(run(ref)));
        }
        return analyzed;
    }

    /**
     * Posts the text to the semantic engine and returns the response body.
     *
     * @param text the plain text to analyze, sent UTF-8 encoded
     * @param format value of the {@code Accept} header
     * @return the response body, decoded as UTF-8 (empty when the response
     *         carries no entity)
     * @throws ClientProtocolException on HTTP protocol errors
     * @throws IOException on I/O errors or when the status code is not 200; the
     *         request is aborted before the exception is propagated
     */
    protected String callSemanticEngine(String text, String format) throws ClientProtocolException, IOException {
        HttpPost post = new HttpPost(resolveEngineUrl());
        try {
            post.setHeader("Accept", format);
            post.setHeader("Content-Type", "text/plain");
            post.setEntity(new ByteArrayEntity(text.getBytes("utf-8")));
            HttpResponse response = this.httpClient.execute(post);
            String body = readBody(response);
            if (response.getStatusLine().getStatusCode() == 200) {
                return body;
            }
            String status = response.getStatusLine().toString();
            log.error(status + ":\n" + body);
            throw new IOException(status);
        } catch (IOException e) {
            // Also covers ClientProtocolException, which is an IOException.
            post.abort();
            throw e;
        }
    }

    /**
     * Returns the explicit engine URL parameter when set, otherwise the runtime
     * property, otherwise the default URL.
     */
    private String resolveEngineUrl() {
        if (this.engineURL != null) {
            return this.engineURL;
        }
        String configured = Framework.getProperty(ENGINE_URL_PROPERTY, DEFAULT_ENGINE_URL);
        if (configured == null || configured.trim().isEmpty()) {
            return DEFAULT_ENGINE_URL;
        }
        return configured;
    }

    /**
     * Reads the whole response body as UTF-8, always closing the stream so the
     * underlying connection is released even when reading fails.
     */
    private static String readBody(HttpResponse response) throws IOException {
        if (response.getEntity() == null) {
            return "";
        }
        InputStream content = response.getEntity().getContent();
        try {
            // Explicit charset: the platform default must not influence decoding.
            return IOUtils.toString(content, "UTF-8");
        } finally {
            content.close();
        }
    }

    /**
     * Concatenates the textual content of a document: title, description, note
     * body converted from HTML, related text resources and the text of every
     * attached blob, separated by blank lines.
     *
     * @param documentModel the document to read
     * @return the extracted text
     * @throws ClientException if a property cannot be read or converted
     */
    protected String extractText(DocumentModel documentModel) throws ClientException {
        StringBuilder text = new StringBuilder();

        String title = documentModel.getTitle();
        if (title != null) {
            text.append(title);
            text.append(PARAGRAPH_SEPARATOR);
        }

        Serializable description = documentModel.getPropertyValue("dc:description");
        if (description != null) {
            text.append(description);
            text.append(PARAGRAPH_SEPARATOR);
        }

        try {
            String note = (String) documentModel.getPropertyValue("note:note");
            if (note != null) {
                StreamingBlob html = StreamingBlob.createFromString(note);
                html.setMimeType("text/html");
                BlobHolder converted = convertToText(html);
                if (converted != null && converted.getBlob() != null) {
                    text.append(converted.getBlob().getString());
                    text.append(PARAGRAPH_SEPARATOR);
                }
            }
        } catch (IOException e) {
            throw new ClientException(e);
        } catch (PropertyException ignored) {
            // Deliberately ignored: presumably raised when the document has no
            // note property; there is then simply no note text to extract.
        }

        if (documentModel.hasFacet("HasRelatedText")) {
            List<?> resources = (List<?>) documentModel.getProperty("relatedtext:relatedtextresources").getValue(
                    List.class);
            if (resources != null) {
                for (Object resource : resources) {
                    String related = (String) ((Map<?, ?>) resource).get("relatedtext");
                    if (related != null && !related.trim().isEmpty()) {
                        text.append(related);
                        text.append(PARAGRAPH_SEPARATOR);
                    }
                }
            }
        }

        text.append(blobsToText(new BlobsExtractor().getBlobs(documentModel)));
        return text.toString();
    }

    /**
     * Converts each blob to plain text and joins the results with blank lines.
     * Blobs that fail to convert are logged and skipped.
     *
     * @param blobs the blobs to convert
     * @return the joined text of all convertible blobs
     */
    protected String blobsToText(List<Blob> blobs) {
        List<String> texts = new ArrayList<String>();
        for (Blob source : blobs) {
            try {
                BlobHolder converted = convertToText(source);
                if (converted == null || converted.getBlob() == null) {
                    continue;
                }
                String content = new String(converted.getBlob().getByteArray(), "UTF-8");
                // Replace NUL characters (code point 0) with spaces.
                texts.add(content.replace('\u0000', ' '));
            } catch (Exception e) {
                // Best effort: one unreadable attachment must not abort the extraction.
                log.error(e.getMessage(), e);
            }
        }
        return StringUtils.join(texts, PARAGRAPH_SEPARATOR);
    }

    /** Runs the "any2text" converter on a single blob, without parameters. */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    private BlobHolder convertToText(Blob blob) throws ClientException {
        return this.conversionService.convert(ANY2TEXT, new SimpleBlobHolder(blob), (Map) null);
    }
}
