@conference {359, title = {Causal Relational Learning}, booktitle = {International Conference on Management of Data (SIGMOD)}, year = {2020}, abstract = {Causal inference is at the heart of empirical research in natural and social sciences and is critical for scientific discovery and informed decision making. The gold standard in causal inference is performing randomized controlled trials; unfortunately these are not always feasible due to ethical, legal, or cost constraints. As an alternative, methodologies for causal inference from observational data have been developed in statistical studies and social sciences. However, existing methods critically rely on restrictive assumptions such as the study population consisting of homogeneous elements that can be represented in a single flat table, where each row is referred to as a unit. In contrast, in many real-world settings, the study domain naturally consists of heterogeneous elements with complex relational structure, where the data is naturally represented in multiple related tables. In this paper, we present a formal framework for causal inference from such relational data. We propose a declarative language called CaRL for capturing causal background knowledge and assumptions, and specifying causal queries using simple Datalog-like rules. CaRL provides a foundation for inferring causality and reasoning about the effect of complex interventions in relational domains. We present an extensive experimental evaluation on real relational data to illustrate the applicability of CaRL in social sciences and healthcare.}, author = {Babak Salimi and Harsh Parikh and Moe Kayali and Sudeepa Roy and Lise Getoor and Dan Suciu} } @conference {349, title = {Collective Alignment of Large-scale Ontologies}, booktitle = {AKBC Workshop on Federated Knowledge Bases (FKBs)}, year = {2019}, abstract = {The rapid growth in digitization of data has led to creation of fragmented but vital knowledge sources. 
Ontologies are one such crucial source of knowledge and aligning them is a key challenge for creating an Open Knowledge Network. The task of ontology alignment has received significant attention. In this abstract, we build on existing work and propose a novel probabilistic ontology alignment approach that combines several similarity measures with structural information such as subsumption and mutual exclusion. Most large-scale ontologies such as product catalogs [Agrawal et. al. 2001] and folksonomies [Plangprasopchok et. al. 2010] do not have a formally defined ontology with well-defined classes, instances and properties. Instead, they loosely define relationships such as subsumption between various entities. For example, a folksonomy for Instagram would contain not only tags corresponding to people, places and activities but also tags such as Selfie, which correspond to a type of image. Product catalogs have very different textual representation for the same entity. For instance, products related to 3D printing are present in a category called 3D Printing \& Supplies on Ebay, while the same products are present in a category called Additive Manufacturing Products on Amazon. Moreover, the same textual representation might have different semantics based on the source of the ontology. The category Headphones in an ontology corresponding to a particular company is different from the Headphones category of a large e-commerce retailer such as Amazon. Even aligning tracks in a music catalog is considerably challenging as it is unclear whether the tracks Bohemian Rhapsody OST and Bohemian Rhapsody Remastered 2011 are the same. To sum up, ontology alignment is challenging due to informally defined subsumptions, multiple textual representations for the same class, ambiguity of similar textual representations and presence of a large number of instance variations. 
Existing ontology alignment approaches can be classified into schema-based approaches, instance-based approaches and hybrid approaches [Euzenat et. al. 2007]. Hybrid approaches such as InformationFlow-based Map [Kalfoglou et. al., 2003] combine string-based heuristics and the structure of the ontology to generate alignments. Naive Ontology Mapping [Ehrig et. al., 2004] makes use of rules that exploit information present in the ontology. Motivated by these hybrid methods, our proposed ontology alignment approach combines several similarity and distance scores with soft structural constraints. We then define a probability distribution over the set of all possible alignments that takes into account correlations between different alignments. Apart from similarity scores computed on the textual representation of entities, we also compute scores using the entity hierarchy described by the subsumption relations. This helps in identifying the semantics of each entity. Apart from structural constraints such as mutual exclusion, we also incorporate relation specific constraints. For instance, it is unlikely that multiple entities that have a parent-child relationship align to a single entity. We use Probabilistic Soft Logic (PSL) [Bach et. al. 2017], a powerful probabilistic programming framework, that uses weighted first-order logic rules to define a probability distribution. Having defined the distribution, we use the efficient MAP inference supported by PSL to identify the most likely alignment. We performed experiments on product taxonomies extracted from four websites and compared our method to a tf-idf similarity score based approach. While the instance-based similarity score prevented aligning categories such as bicycle stands \& storage and storage \& home organization, the structural constraints helped distinguish between equivalence and more general relations. 
For example, beauty \& personal care was aligned to beauty and not hair care, even though there is a significant overlap of products, as hair care was the child of beauty in the product taxonomy. In summary, combining multiple scores and structural constraints using a probabilistic framework led to a 36\% improvement in precision and a 15\% improvement in F1 score over the string similarity baseline.}, author = {Varun Embar and Jay Pujara and Lise Getoor} } @article {363, title = {Generating and Understanding Personalized Explanations in Hybrid Recommender Systems}, journal = {ACM Transactions on Interactive Intelligent Systems}, year = {2019}, abstract = {Recommender systems are ubiquitous, and shape the way users access information and make decisions. As these systems become more complex, there is a growing need for transparency and interpretability. In this paper, we study the problem of generating and visualizing personalized explanations for recommender systems which incorporate signals from many different data sources. We use a flexible, extendable probabilistic programming approach, and show how we can generate real-time personalized recommendations. We then turn these personalized recommendations into explanations. We perform an extensive user study to evaluate the benefits of explanations for hybrid recommender systems. We conduct a crowd-sourced user study where our system generates personalized recommendations and explanations for real users of the last.fm music platform. First, we evaluate the performance of the recommendations in terms of perceived accuracy and novelty. Next, we experiment with 1) different explanation styles (e.g., user-based, item-based), 2) manipulating the number of explanation styles presented, and 3) manipulating the presentation format (e.g., textual vs. visual). 
We also apply a mixed-model statistical analysis to consider user personality traits as a control variable and demonstrate the usefulness of our approach in creating personalized hybrid explanations with different style, number, and format. Finally, we perform a post analysis which shows different preferences for explanation styles between experienced and novice last.fm users.}, author = {Pigi Kouki and James Schaffer and Jay Pujara and John O{\textquoteright}Donovan and Lise Getoor} } @conference {342, title = {Personalized Explanations for Hybrid Recommender Systems}, booktitle = {Intelligent User Interfaces (IUI)}, year = {2019}, abstract = {Hybrid recommender systems, which combine the strength of several information sources to provide recommendations, have emerged as a means to improve the quality of recommendations. Although such systems are highly effective, they are inherently complex. As a result, providing users with a visually-appealing and useful explanation for each recommendation poses a significant challenge. In this paper, we study the problems of generating and visualizing personalized explanations from hybrid recommender systems. We build upon a hybrid probabilistic graphical model and develop an approach to generate real-time recommendations along with personalized explanations. To study the benefits of explanations for hybrid recommender systems, we conduct a crowd-sourced user study where our system generates personalized recommendations and explanations for real users of the last.fm music platform. We experiment with 1) different explanation styles (e.g., user-based, item-based), 2) varying the volume (i.e., number) of the explanation styles, and 3) a variety of presentation formats (such as textual or visual). 
We apply a mixed model statistical analysis to consider the user personality traits as a control variable, and demonstrate the usefulness of our approach in creating personalized hybrid explanations with different style, volume, and format.}, author = {Pigi Kouki and James Schaffer and Jay Pujara and John O{\textquoteright}Donovan and Lise Getoor} } @conference {321, title = {Aligning Product Categories using Anchor Products}, booktitle = {Workshop on Knowledge Base Construction, Reasoning and Mining (KBCOM)}, year = {2018}, abstract = {

E-commerce sites group similar products into categories, and these categories are further organized in a taxonomy. Since different sites have different products and cater to a variety of shoppers, the taxonomies differ both in the categorization of products and the textual representation used for these categories. In this paper, we propose a technique to align categories across sites, which is useful information to have in product graphs. We use breadcrumbs present on the product pages to infer a site{\textquoteright}s taxonomy. We generate a list of candidate category pairs for alignment using anchor products, products present in two or more sites. We use multiple similarity and distance metrics to compare these candidates. To generate the final set of alignments, we propose a model that combines these metrics with a set of structural constraints. The model is based on probabilistic soft logic (PSL), a scalable probabilistic programming framework. We run experiments on data extracted from Amazon, Ebay, Staples and Target, and show that the distance metric based on products, and the use of PSL to combine various metrics and structural constraints lead to improved alignments.

}, author = {Varun Embar and Golnoosh Farnadi and Jay Pujara and Lise Getoor} } @conference {329, title = {Clustering System Data using Aggregate Measures}, booktitle = {Machine Learning and Systems (MLSys)}, year = {2018}, abstract = {

Many real-world systems generate a tremendous amount of data cataloging the actions, responses, and internal states. Prominent examples include user logs on web servers, instrumentation of source code, and performance statistics in large data centers. The magnitude of this data makes it impossible to log individual events, but instead requires capturing aggregate statistics at a coarser granularity, resulting in statistical distributions instead of discrete values. We survey several popular statistical distance measures and demonstrate how appropriate statistical distances can allow meaningful clustering of web log data.

}, author = {Johnnie Chang and Robert Chen and Jay Pujara and Lise Getoor} } @article {331, title = {Collective Entity Resolution in Multi-Relational Familial Networks}, journal = {Knowledge and Information Systems (KAIS)}, volume = {61}, year = {2018}, pages = {1547{\textendash}1581}, abstract = {

Entity resolution in settings with rich relational structure often introduces complex dependencies between co-references. Exploiting these dependencies is challenging -- it requires seamlessly combining statistical, relational, and logical dependencies.\ One task of particular interest is entity resolution\ in familial networks. \ In this setting, \ multiple partial representations of a family tree are provided, from the perspective of different family members, and the challenge is to reconstruct a family tree from these multiple, noisy, partial views. \  This reconstruction is crucial for applications such as understanding genetic inheritance,\ tracking disease contagion, \ and performing census surveys.\ Here, we design a model that incorporates statistical signals (such as name similarity), relational information (such as sibling overlap), logical constraints (such as transitivity and bijective matching), and predictions from other algorithms (such as logistic regression and support vector machines), in a collective model. We show how to integrate these features using probabilistic soft logic, a scalable probabilistic programming framework. In experiments on real-world data, our model significantly outperforms state-of-the-art classifiers that use relational features but are incapable of collective reasoning.

}, author = {Pigi Kouki and Jay Pujara and Christopher Marcum and Laura Koehly and Lise Getoor} } @conference {328, title = {Scalable Probabilistic Causal Structure Discovery}, booktitle = {International Joint Conference on Artificial Intelligence (IJCAI)}, year = {2018}, abstract = {

Complex causal networks underlie many real-world problems, from the regulatory interactions between genes to the environmental patterns used to understand climate change. Computational methods seek to infer these causal networks using observational data and domain knowledge. In this paper, we identify three key requirements for inferring the structure of causal networks for scientific discovery: (1) robustness to noise in observed measurements; (2) scalability to handle hundreds of variables; and (3) flexibility to encode domain knowledge and other structural constraints. We first formalize the problem of joint probabilistic causal structure discovery.\ We develop an approach using probabilistic soft logic (PSL) that exploits multiple statistical tests, supports efficient optimization over hundreds of variables, and can easily incorporate structural constraints, including imperfect domain knowledge. We compare our method against multiple well-studied approaches on biological and synthetic datasets, showing improvements of up to 20\% in F1-score over the best performing baseline in realistic settings.

}, url = {https://bitbucket.org/linqs/causpsl/src/master/}, author = {Dhanya Sridhar and Pujara, Jay and Lise Getoor} } @conference {kouki:icdm17, title = {Collective Entity Resolution in Familial Networks}, booktitle = {IEEE International Conference on Data Mining (ICDM)}, year = {2017}, note = {To Appear}, abstract = {

Entity resolution in settings with rich relational structure often introduces complex dependencies between coreferences. Exploiting these dependencies is challenging {\textendash} it requires seamlessly combining statistical, relational, and logical dependencies. One task of particular interest is entity resolution in familial networks. In this setting, multiple partial representations of a family tree are provided, from the perspective of different family members, and the challenge is to reconstruct a family tree from these multiple, noisy, partial views. This reconstruction is crucial for applications such as understanding genetic inheritance, tracking disease contagion, and performing census surveys. Here, we design a model that incorporates statistical signals, such as name similarity, relational information, such as sibling overlap, and logical constraints, such as transitivity and bijective matching, in a collective model. We show how to integrate these features using probabilistic soft logic, a scalable probabilistic programming framework. In experiments on real-world data, our model significantly outperforms state-of-the-art classifiers that use relational features but are incapable of collective reasoning.

}, url = {https://github.com/pkouki/icdm2017}, author = {Kouki, Pigi and Pujara, Jay and Marcum, Christopher and Koehly, Laura and Lise Getoor} } @conference {tomkins:ijcai17, title = {Disambiguating Energy Disaggregation: A Collective Probabilistic Approach}, booktitle = {International Joint Conference on Artificial Intelligence}, year = {2017}, author = {Tomkins, Sabina and Pujara, Jay and Lise Getoor} } @conference {kim:www17, title = {Probabilistic Visitor Stitching on Cross-Device Web Logs}, booktitle = {International Conference on World Wide Web (WWW)}, year = {2017}, pages = {1581{\textendash}1589}, author = {Kim, Sungchul and Kini, Nikhil and Pujara, Jay and Koh, Eunyee and Lise Getoor} } @conference {pujara:emnlp17, title = {Sparsity and Noise: Where Knowledge Graph Embeddings Fall Short}, booktitle = {Conference on Empirical Methods in Natural Language Processing (EMNLP)}, year = {2017}, url = {https://github.com/eriq-augustine/meta-kg}, author = {Pujara, Jay and Eriq Augustine and Lise Getoor} } @conference {kouki:recsys17, title = {User Preferences for Hybrid Explanations}, booktitle = {11th ACM Conference on Recommender Systems (RecSys)}, year = {2017}, author = {Kouki, Pigi and Schaffer, James and Pujara, Jay and O{\textquoteright}Donovan, John and Lise Getoor} } @conference {sridhar:akbc17, title = {Using Noisy Extractions to Discover Causal Knowledge}, booktitle = {NIPS Workshop on Automated Knowledge Base Construction}, year = {2017}, author = {Dhanya Sridhar and Pujara, Jay and Lise Getoor} } @conference {fakhraei:mlg16, title = {Adaptive Neighborhood Graph Construction for Inference in Multi-Relational Networks}, booktitle = {KDD}, year = {2016}, publisher = {ACM SIGKDD}, organization = {ACM SIGKDD}, abstract = {

A neighborhood graph, which represents the instances as vertices and their relations as weighted edges, is the basis of many semi-supervised and relational models for node labeling and link prediction. Most methods employ a sequential process to construct the neighborhood graph. This process often consists of generating a candidate graph, pruning the candidate graph to make a neighborhood graph, and then performing inference on the variables (i.e., nodes) in the neighborhood graph. In this paper, we propose a framework that can dynamically adapt the neighborhood graph based on the states of variables from intermediate inference results, as well as structural properties of the relations connecting them. A key strength of our framework is its ability to handle multi-relational data and employ varying amounts of relations for each instance based on the intermediate inference results. We formulate the link prediction task as inference on neighborhood graphs, and include preliminary results illustrating the effects of different strategies in our proposed framework.

}, author = {Shobeir Fakhraei and Dhanya Sridhar and Jay Pujara and Lise Getoor} } @conference {pujara:starai16, title = {Generic Statistical Relational Entity Resolution in Knowledge Graphs}, booktitle = {StarAI}, year = {2016}, note = {On arXiv: https://arxiv.org/abs/1607.00992}, publisher = {IJCAI 2016}, organization = {IJCAI 2016}, abstract = {

Entity resolution, the problem of identifying the underlying entity of references found in data, has been researched for many decades in many communities. A common theme in this research has been the importance of incorporating relational features into the resolution process. Relational entity resolution is particularly important in knowledge graphs (KGs), which have a regular structure capturing entities and their interrelationships. We identify three major problems in KG entity resolution: (1) intra-KG reference ambiguity; (2) inter-KG reference ambiguity; and (3) ambiguity when extending KGs with new facts. We implement a framework that generalizes across these three settings and exploits this regular structure of KGs. Our framework has many advantages over custom solutions widely deployed in industry, including collective inference, scalability, and interpretability. We apply our framework to two real-world KG entity resolution problems, ambiguity in NELL and merging data from Freebase and MusicBrainz, demonstrating the importance of relational features.

}, author = {Jay Pujara and Lise Getoor} } @phdthesis {pujara:thesis16, title = {Probabilistic Models for Scalable Knowledge Graph Construction}, year = {2016}, school = {University of Maryland, College Park}, type = {phd}, abstract = {

In the past decade, systems that extract information from millions of Internet documents have become commonplace. Knowledge graphs -- structured knowledge bases that describe entities, their attributes and the relationships between them -- are a powerful tool for understanding and organizing this vast amount of information. However, a significant obstacle to knowledge graph construction is the unreliability of the extracted information, due to noise and ambiguity in the underlying data or errors made by the extraction system and the complexity of reasoning about the dependencies between these noisy extractions. My dissertation addresses these challenges by exploiting the interdependencies between facts to improve the quality of the knowledge graph in a scalable framework. I introduce a new approach called knowledge graph identification (KGI), which resolves the entities, attributes and relationships in the knowledge graph by incorporating uncertain extractions from multiple sources, entity co-references, and ontological constraints. I define a probability distribution over possible knowledge graphs and infer the most probable knowledge graph using a combination of probabilistic and logical reasoning. Such probabilistic models are frequently dismissed due to scalability concerns, but my implementation of KGI maintains tractable performance on large problems through the use of hinge-loss Markov random fields, which have a convex inference objective. This allows the inference of large knowledge graphs using 4M facts and 20M ground constraints in 2 hours. To further scale the solution, I develop a distributed approach to the KGI problem which runs in parallel across multiple machines, reducing inference time by 90\%. Finally, I extend my model to the streaming setting, where a knowledge graph is continuously updated by incorporating newly extracted facts. 
I devise a general approach for approximately updating inference in convex probabilistic models, and quantify the approximation error by defining and bounding inference regret for online models. Together, my work retains the attractive features of probabilistic models while providing the scalability necessary for large-scale knowledge graph construction. These models have been applied on a number of real-world knowledge graph projects, including the NELL project at Carnegie Mellon and the Google Knowledge Graph.

}, author = {Jay Pujara} } @conference {kumar:asonam16, title = {Unsupervised Models for Predicting Strategic Relations between Organizations}, booktitle = {ASONAM}, year = {2016}, abstract = {

Microblogging sites like Twitter provide a platform for sharing ideas and expressing opinions. The widespread popularity of these platforms and the complex social structure that arises within these communities provides a unique opportunity to understand the interactions between users. The political domain, especially in a multi-party system, presents compelling challenges, as political parties have different levels of alignment based on their political strategies. We use Twitter to understand the nuanced relationships between differing political entities in Latin America. Our model incorporates diverse signals from the content of tweets and social context from retweets, mentions and hashtag usage. Since direct communications between entities are relatively rare, we explore models based on the posts of users who interact with multiple political organizations. We present a quantitative and qualitative analysis of the results of models using different features, and demonstrate that a model capable of using sentiment strength, social context, and issue alignment has superior performance to less sophisticated baselines.

}, author = {Shachi Kumar and Jay Pujara and Lise Getoor and David Mares and Dipak Gupta and Ellen Riloff} } @conference {pujara:uai15, title = {Budgeted Online Collective Inference}, booktitle = {UAI}, year = {2015}, abstract = {

Updating inference in response to new evidence is a fundamental challenge in artificial intelligence. Many real problems require large probabilistic graphical models, containing millions of interdependent variables. For such large models, jointly updating the most likely (i.e., MAP) configuration of the variables each time new evidence is encountered can be infeasible, even if inference is tractable. In this paper, we introduce budgeted online collective inference, in which the MAP configuration of a graphical model is updated efficiently by revising the assignments to a subset of the variables while holding others fixed. The goal is to selectively update certain variables without sacrificing quality with respect to full inference. To formalize the consequences of partially updating inference, we introduce the concept of inference regret. We derive inference regret bounds for a class of graphical models with strongly-convex free energies. These theoretical insights, combined with a thorough analysis of the optimization solver, motivate new approximate methods for efficiently updating the variable assignments under a budget constraint. In experiments, we demonstrate that our algorithms can reduce inference time by 65\% with accuracy comparable to full inference.

}, author = {Jay Pujara and Ben London and Lise Getoor} } @conference {pujara:starai15, title = {Online Inference for Knowledge Graph Construction}, booktitle = {Workshop on Statistical Relational AI}, year = {2015}, author = {Pujara, Jay and London, Ben and Lise Getoor and Cohen, William} } @conference {grycner:emnlp15, title = {RELLY: Inferring Hypernym Relationships Between Relational Phrases}, booktitle = {Conference on Empirical Methods in Natural Language Processing}, year = {2015}, author = {Grycner, Adam and Weikum, Gerhard and Pujara, Jay and Foulds, James and Lise Getoor} } @article {pujara:aimag15, title = {Using Semantics \& Statistics to Turn Data into Knowledge}, journal = {AI Magazine}, volume = {36}, number = {1}, year = {2015}, pages = {65{\textendash}74}, author = {Pujara, Jay and Miao, Hui and Lise Getoor and Cohen, William} } @conference {grycner:akbc2014, title = {A Unified Probabilistic Approach for Semantic Clustering of Relational Phrases}, booktitle = {NIPS Workshop on Automated Knowledge Base Construction}, year = {2014}, abstract = {

The task of finding synonymous relational phrases is important in natural language understanding problems such as question answering and paraphrase detection. While this task has been addressed by many previous systems, each of these existing approaches is limited either in expressivity or in scalability. To address this challenge, we present a large-scale statistical relational method for clustering relational phrases using Probabilistic Soft Logic (PSL) [1]. To assess the quality of our approach, we evaluated it relative to a set of baseline methods. The proposed technique was found to outperform the baselines for both clustering and link prediction, and was shown to be scalable enough to be applied to 200,000 relational phrases.

}, author = {Adam Grycner and Gerhard Weikum and Jay Pujara and James Foulds and Lise Getoor} } @conference {pujara:akbc14, title = {Building Dynamic Knowledge Graphs}, booktitle = {NIPS Workshop on Automated Knowledge Base Construction}, year = {2014}, author = {Pujara, Jay and Lise Getoor} } @conference {pujara:wtbudg13, title = {Joint Judgments with a Budget: Strategies for Reducing the Cost of Inference}, booktitle = {ICML Workshop on Machine Learning with Test-Time Budgets}, year = {2013}, author = {Pujara, Jay and Miao, Hui and Lise Getoor} } @conference {pujara:iswc13, title = {Knowledge Graph Identification}, booktitle = {International Semantic Web Conference (ISWC)}, year = {2013}, note = {Winner of Best Student Paper award}, author = {Pujara, Jay and Miao, Hui and Lise Getoor and Cohen, William} } @conference {pujara:slg13, title = {Large-Scale Knowledge Graph Identification using PSL}, booktitle = {ICML Workshop on Structured Learning (SLG)}, year = {2013}, author = {Pujara, Jay and Miao, Hui and Lise Getoor and Cohen, William} } @conference {pujara:sbd13, title = {Large-Scale Knowledge Graph Identification using PSL}, booktitle = {AAAI Fall Symposium on Semantics for Big Data}, year = {2013}, author = {Pujara, Jay and Miao, Hui and Lise Getoor and Cohen, William} } @conference {pujara:akbc13, title = {Ontology-Aware Partitioning for Knowledge Graph Identification}, booktitle = {CIKM Workshop on Automatic Knowledge Base Construction}, year = {2013}, author = {Pujara, Jay and Miao, Hui and Lise Getoor and Cohen, William} } @conference {pujara:nips12, title = {Large-Scale Hierarchical Topic Models}, booktitle = {NIPS Workshop on BigLearn}, year = {2012}, abstract = {

In the past decade, a number of advances in topic modeling have produced sophisticated models that are capable of generating hierarchies of topics. One challenge for these models is scalability: they are incapable of working at the massive scale of millions of documents and hundreds of thousands of terms. We address this challenge with a technique that learns a hierarchy of topics by iteratively applying topic models and processing subtrees of the hierarchy in parallel. This approach has a number of scalability advantages compared to existing techniques, and shows promising results in experiments assessing runtime and human evaluations of quality. We detail extensions to this approach that may further improve hierarchical topic modeling for large-scale applications.

}, author = {Jay Pujara and Peter Skomoroch} } @article {getoor:tkde12b, title = {TACI: Taxonomy-Aware Catalog Integration}, journal = {TKDE}, volume = {25}, year = {2012}, pages = {1643--1655}, abstract = {

A fundamental data integration task faced by online commercial portals and commerce search engines is the integration of products coming from multiple providers to their product catalogs. In this scenario, the commercial portal has its own taxonomy (the {\textquotedblleft}master taxonomy{\textquotedblright}), while each data provider organizes its products into a different taxonomy (the {\textquotedblleft}provider taxonomy{\textquotedblright}). In this paper, we consider the problem of categorizing products from the data providers into the master taxonomy, while making use of the provider taxonomy information. Our approach is based on a taxonomy-aware processing step that adjusts the results of a text-based classifier to ensure that products that are close together in the provider taxonomy remain close in the master taxonomy. We formulate this intuition as a structured prediction optimization problem. To the best of our knowledge, this is the first approach that leverages the structure of taxonomies in order to enhance catalog integration. We propose algorithms that are scalable and thus applicable to the large datasets that are typical on the Web. We evaluate our algorithms on real-world data and we show that taxonomy-aware classification provides a significant improvement over existing approaches.

}, author = {Papadimitriou Panagiotis and Tsaparas Panayiotis and Fuxman Ariel and Lise Getoor} } @conference {plangprasopchok:wsdm2011, title = {A Probabilistic Approach for Learning Folksonomies from Structured Data}, booktitle = {Fourth ACM International Conference on Web Search and Data Mining (WSDM)}, year = {2011}, author = {Plangprasopchok, Anon and Lerman, Kristina and Lise Getoor} } @conference {pujara:icmlws11, title = {Reducing Label Cost by Combining Feature Labels and Crowdsourcing}, booktitle = {ICML Workshop on Combining Learning Strategies to Reduce Label Cost}, year = {2011}, author = {Pujara, Jay and London, Ben and Lise Getoor} } @conference {pujara:ceas11, title = {Using Classifier Cascades for Scalable E-Mail Classification}, booktitle = {Collaboration, Electronic Messaging, Anti-Abuse and Spam Conference}, series = {ACM International Conference Proceedings Series}, year = {2011}, note = {Winner of a Best Paper award}, publisher = {ACM}, organization = {ACM}, author = {Pujara, Jay and Daume III, Hal and Lise Getoor} } @conference {pujara:nips10, title = {Coarse-to-Fine, Cost-Sensitive Classification of E-Mail}, booktitle = {NIPS Workshop on Coarse-to-Fine Processing}, year = {2010}, author = {Pujara, Jay and Lise Getoor} } @conference {plang:kdd10, title = {Growing a tree in the forest: constructing folksonomies by integrating structured metadata}, booktitle = {ACM SIGKDD International Conference on Knowledge Discovery and Data Mining}, year = {2010}, author = {Plangprasopchok, Anon and Lerman, Kristina and Lise Getoor} } @article {Polymeropoulos:SchizRes09, title = {Common effect of antipsychotics on the biosynthesis and regulation of fatty acids and cholesterol supports a key role of lipid homeostasis in schizophrenia.}, journal = {Schizophrenia Research}, year = {2009}, keywords = {bioinformatics gene expression analysis antipsychotic pharmacogenetics}, author = {Polymeropoulos, Mihales and Licamele, Louis and Volpi, Simona and Mack, Kendra 
and Mitkus, Shruti and Carstea, Eugene and Lise Getoor and Lavedan, Christian} } @conference {schnaitter:vldb09, title = {Index Interactions in Physical Design Tuning: Modeling, Analysis, and Applications}, booktitle = {International Conference on Very Large Data Bases}, year = {2009}, author = {Schnaitter, Karl and Polyzotis, Neoklis and Lise Getoor} } @book {getoor:prm-ch-srl-book07, title = {Probabilistic Relational Models}, series = {An Introduction to Statistical Relational Learning}, volume = {1}, year = {2007}, pages = {129--174}, publisher = {MIT Press}, organization = {MIT Press}, edition = {1}, chapter = {5}, abstract = {

Probabilistic relational models (PRMs) are a rich representation language for structured statistical models. They combine a frame-based logical representation with probabilistic semantics based on directed graphical models (Bayesian networks). This chapter gives an introduction to probabilistic relational models, describing semantics for attribute uncertainty, structural uncertainty, and class uncertainty. For each case, learning algorithms and some sample results are presented.

}, author = {Lise Getoor and Nir Friedman and Daphne Koller and Avi Pfeffer and Benjamin Taskar}, editor = {Lise Getoor and Benjamin Taskar} } @conference {kddpanel06, title = {Is there a grand challenge or X-prize for data mining?}, booktitle = {12th International Conference on Knowledge Discovery and Data Mining}, year = {2006}, author = {Piatetsky-Shapiro, Gregory and Grossman, Robert and Djeraba, Chabane and Feldman, Ronen and Lise Getoor and Zaki, Mohammed} } @book {getoor:rdm-book01, title = {Learning Probabilistic Relational Models}, series = {Relational Data Mining}, volume = {1}, year = {2001}, pages = {307--335}, publisher = {Springer-Verlag}, organization = {Springer-Verlag}, edition = {1}, chapter = {13}, abstract = {

Probabilistic relational models (PRMs) are a language for describing statistical models over typed relational domains. A PRM models the uncertainty over the attributes of objects in the domain and uncertainty over the relations between the objects. The model specifies, for each attribute of an object, its (probabilistic) dependence on other attributes of that object and on attributes of related objects. The dependence model is defined at the level of classes of objects. The class dependence model is instantiated for any object in the class, as appropriate to the particular context of the object (i.e., the relations between this objects and others). PRMs can also represent uncertainty over the relational structure itself, e.g., by specifying a (class-level) probability that two objects will be related to each other. PRMs provide a foundation for dealing with the noise and uncertainty encountered in most real-world domains. In this chapter, we show that the compact and natural representation of PRMs allows them to be learned directly from an existing relational database using well-founded statistical techniques. We give an introduction to PRMs and an overview of methods for learning them. We show that PRMs provide a new framework for relational data mining, and offer new challenges for the endeavor of learning relational models for real-world domains.

}, author = {Lise Getoor and Nir Friedman and Daphne Koller and Avi Pfeffer}, editor = {Saso Dzeroski and Nada Lavrac} } @conference {325, title = {Learning Probabilistic Relational Models}, booktitle = {Relational Data Mining}, year = {2001}, month = {2001}, publisher = {Springer-Verlag}, organization = {Springer-Verlag}, author = {Lise Getoor and Friedman, Nir and Koller, Daphne and Pfeffer, Avi} } @conference {friedman:ijcai99, title = {Learning Probabilistic Relational Models}, booktitle = {International Joint Conference on Arti cial Intelligence}, year = {1999}, author = {Friedman, Nir and Lise Getoor and Koller, Daphne and Pfeffer, Avi} }