@conference {360, title = {Contrastive Entity Linkage: Mining Variational Attributes from Large Catalogs for Entity Linkage}, booktitle = {Automated Knowledge Base Construction (AKBC)}, year = {2020}, abstract = {The presence of near-identical, but distinct, entities called entity variations makes the task of data integration challenging. For example, in the domain of grocery products, variations share the same value for attributes such as brand, manufacturer and product line, but differ in other attributes, called variational attributes, such as package size and color. Identifying variations across data sources is an important task in itself and is crucial for identifying duplicates. However, this task is challenging as the variational attributes are often present as part of unstructured text and are domain dependent. In this work, we propose our approach, Contrastive Entity Linkage, to identify both entity pairs that are the same and pairs that are variations of each other. We propose a novel unsupervised approach, VarSpot, to mine domain-dependent variational attributes present in unstructured text. The proposed approach reasons about both similarities and differences between entities and can easily scale to large sources containing millions of entities. We show the generality of our approach by performing an experimental evaluation on three different domains. Our approach significantly outperforms state-of-the-art learning-based and rule-based entity linkage systems by up to 4\% in F1 score when identifying duplicates, and by up to 41\% when identifying entity variations.}, author = {Varun Embar and Bunyamin Sisman and Hao Wei and Xin Luna Dong and Christos Faloutsos and Lise Getoor} } @conference {357, title = {Estimating Aggregate Properties In Relational Networks With Unobserved Data}, booktitle = {AAAI Workshop on Statistical Relational Artificial Intelligence (StarAI)}, year = {2020}, abstract = {Aggregate network properties such as cluster cohesion and the number of bridge nodes can be used to glean insights about a network{\textquoteright}s community structure, the spread of influence, and its resilience to faults. Efficiently computing network properties when the network is fully observed has received significant attention (Wasserman and Faust 1994; Cook and Holder 2006); however, the problem of computing aggregate network properties when data attributes are missing has received little attention. Computing these properties for networks with missing attributes involves performing inference over the network. Statistical relational learning (SRL) and graph neural networks (GNNs) are two classes of machine learning approaches well suited for inferring missing attributes in a graph. In this paper, we study the effectiveness of these approaches in estimating aggregate properties on networks with missing attributes. We compare two SRL approaches and three GNNs. For these approaches, we estimate the properties using point estimates such as the MAP and the mean. For SRL-based approaches that can infer a joint distribution over the missing attributes, we also estimate these properties as an expectation over the distribution. To compute the expectation tractably for probabilistic soft logic, one of the SRL approaches that we study, we introduce a novel sampling framework. In the experimental evaluation, using three benchmark datasets, we show that SRL-based approaches tend to outperform GNN-based approaches both in computing aggregate properties and in predictive accuracy.
Specifically, we show that estimating the aggregate properties as an expectation over the joint distribution outperforms point estimates. }, author = {Varun Embar and Sriram Srinivasan and Lise Getoor} } @conference {349, title = {Collective Alignment of Large-scale Ontologies}, booktitle = {AKBC Workshop on Federated Knowledge Bases (FKBs)}, year = {2019}, abstract = {The rapid growth in the digitization of data has led to the creation of fragmented but vital knowledge sources. Ontologies are one such crucial source of knowledge, and aligning them is a key challenge for creating an Open Knowledge Network. The task of ontology alignment has received significant attention. In this abstract, we build on existing work and propose a novel probabilistic ontology alignment approach that combines several similarity measures with structural information such as subsumption and mutual exclusion. Most large-scale ontologies such as product catalogs [Agrawal et al. 2001] and folksonomies [Plangprasopchok et al. 2010] do not have a formally defined ontology with well-defined classes, instances and properties. Instead, they loosely define relationships such as subsumption between various entities. For example, a folksonomy for Instagram would contain not only tags corresponding to people, places and activities but also tags such as Selfie, which correspond to a type of image. Product catalogs have very different textual representations for the same entity. For instance, products related to 3D printing are present in a category called 3D Printing \& Supplies on Ebay, while the same products are present in a category called Additive Manufacturing Products on Amazon. Moreover, the same textual representation might have different semantics based on the source of the ontology. The category Headphones in an ontology corresponding to a particular company is different from the Headphones category of a large e-commerce retailer such as Amazon. Even aligning tracks in a music catalog is considerably challenging, as it is unclear whether the tracks Bohemian Rhapsody OST and Bohemian Rhapsody Remastered 2011 are the same. To sum up, ontology alignment is challenging due to informally defined subsumptions, multiple textual representations for the same class, ambiguity of similar textual representations, and the presence of a large number of instance variations. Existing ontology alignment approaches can be classified into schema-based approaches, instance-based approaches and hybrid approaches [Euzenat et al. 2007]. Hybrid approaches such as InformationFlow-based Map [Kalfoglou et al. 2003] combine string-based heuristics and the structure of the ontology to generate alignments. Naive Ontology Mapping [Ehrig et al. 2004] makes use of rules that exploit information present in the ontology. Motivated by these hybrid methods, our proposed ontology alignment approach combines several similarity and distance scores with soft structural constraints. We then define a probability distribution over the set of all possible alignments that takes into account correlations between different alignments. Apart from similarity scores computed on the textual representation of entities, we also compute scores using the entity hierarchy described by the subsumption relations. This helps in identifying the semantics of each entity. Apart from structural constraints such as mutual exclusion, we also incorporate relation-specific constraints.
For instance, it is unlikely that multiple entities that have a parent-child relationship align to a single entity. We use Probabilistic Soft Logic (PSL) [Bach et al. 2017], a powerful probabilistic programming framework that uses weighted first-order logic rules to define a probability distribution. Having defined the distribution, we use the efficient MAP inference supported by PSL to identify the most likely alignment. We performed experiments on product taxonomies extracted from four websites and compared our method to an approach based on tf-idf similarity scores. While the instance-based similarity score prevented aligning categories such as bicycle stands \& storage and storage \& home organization, the structural constraints helped distinguish between equivalence and more general relations. For example, beauty \& personal care was aligned to beauty and not hair care, even though there is a significant overlap of products, as hair care was the child of beauty in the product taxonomy. In summary, combining multiple scores and structural constraints using a probabilistic framework led to a 36\% improvement in precision and a 15\% improvement in F1 score over the string similarity baseline.}, author = {Varun Embar and Jay Pujara and Lise Getoor} } @conference {350, title = {Tractable Marginal Inference for Hinge-Loss Markov Random Fields}, booktitle = {ICML Workshop on Tractable Probabilistic Modeling (TPM)}, year = {2019}, month = {06/2019}, abstract = {Hinge-loss Markov random fields (HL-MRFs) are a class of undirected graphical models that has been successfully applied to model richly structured data. HL-MRFs are defined over a set of continuous random variables in the range [0,1], which makes MAP inference a convex optimization problem. However, computation of marginal distributions remains intractable. In this paper, we introduce a novel sampling-based algorithm to compute marginal distributions. We define the notion of association blocks, which help identify islands of high probability, and propose a novel approach to sample from these regions. We validate our approach by estimating both average precision and various properties of a social network. We show that the proposed approach outperforms MAP estimates in both average precision and the accuracy of the properties, by 20\% and 40\% respectively, on a large social network.}, author = {Varun Embar and Sriram Srinivasan and Lise Getoor} } @conference {321, title = {Aligning Product Categories using Anchor Products}, booktitle = {Workshop on Knowledge Base Construction, Reasoning and Mining (KBCOM)}, year = {2018}, abstract = {

E-commerce sites group similar products into categories, and these categories are further organized in a taxonomy. Since different sites have different products and cater to a variety of shoppers, the taxonomies differ both in the categorization of products and in the textual representation used for these categories. In this paper, we propose a technique to align categories across sites; such alignments are useful information to have in product graphs. We use breadcrumbs present on the product pages to infer a site{\textquoteright}s taxonomy. We generate a list of candidate category pairs for alignment using anchor products: products present on two or more sites. We use multiple similarity and distance metrics to compare these candidates. To generate the final set of alignments, we propose a model that combines these metrics with a set of structural constraints. The model is based on probabilistic soft logic (PSL), a scalable probabilistic programming framework. We run experiments on data extracted from Amazon, Ebay, Staples and Target, and show that the product-based distance metric and the use of PSL to combine various metrics and structural constraints lead to improved alignments.

}, author = {Varun Embar and Golnoosh Farnadi and Jay Pujara and Lise Getoor} } @conference {332, title = {Scalable Structure Learning for Probabilistic Soft Logic}, booktitle = {IJCAI Workshop on Statistical Relational AI (StarAI)}, year = {2018}, month = {06/2018}, abstract = {

Statistical relational frameworks such as Markov logic networks and probabilistic soft logic (PSL) encode model structure with weighted first-order logical clauses. Learning these clauses from data is referred to as structure learning. Structure learning alleviates the manual cost of specifying models. However, this benefit comes with high computational costs; structure learning typically requires an expensive search over the space of clauses, which involves repeated optimization of clause weights. In this paper, we propose the first two approaches to structure learning for PSL. We introduce a greedy search-based algorithm and a novel optimization method that trade off scalability and approximations to the structure learning problem in varying ways. The highly scalable optimization method combines data-driven generation of clauses with a piecewise pseudolikelihood (PPLL) objective that learns model structure by optimizing clause weights only once. We compare both methods across five real-world tasks, showing that PPLL achieves an order-of-magnitude runtime speedup and AUC gains of up to 15\% over greedy search.

}, author = {Varun Embar and Dhanya Sridhar and Golnoosh Farnadi and Lise Getoor} }