@conference {key355, title = {BOWL: Bayesian Optimization for Weight Learning in Probabilistic Soft Logic}, booktitle = {AAAI Conference on Artificial Intelligence (AAAI)}, year = {2020}, abstract = {Probabilistic soft logic (PSL) is a statistical relational learning framework that represents complex relational models with weighted first-order logical rules. The weights of the rules in PSL indicate their importance in the model and influence the effectiveness of the model on a given task. Existing weight learning approaches often attempt to learn a set of weights that maximizes some function of data likelihood. However, this does not always translate to optimal performance on a desired domain metric, such as accuracy or F1 score. In this paper, we introduce a new weight learning approach called Bayesian optimization for weight learning (BOWL) based on Gaussian process regression that directly optimizes weights on a chosen domain performance metric. The key to the success of our approach is a novel projection that captures the semantic distance between the possible weight configurations. Our experimental results show that our proposed approach outperforms likelihood-based approaches and yields up to a 10\% improvement across a variety of performance metrics. Further, we performed experiments to measure the scalability and robustness of our approach on various real world datasets.}, author = {Sriram Srinivasan and Golnoosh Farnadi and Lise Getoor} } @conference {359, title = {Causal Relational Learning}, booktitle = {International Conference on Management of Data (SIGMOD)}, year = {2020}, abstract = {Causal inference is at the heart of empirical research in natural and social sciences and is critical for scientific discovery and informed decision making. The gold standard in causal inference is performing randomized controlled trials; unfortunately these are not always feasible due to ethical, legal, or cost constraints. 
As an alternative, methodologies for causal inference from observational data have been developed in statistical studies and social sciences. However, existing methods critically rely on restrictive assumptions such as the study population consisting of homogeneous elements that can be represented in a single flat table, where each row is referred to as a unit. In contrast, in many real-world settings, the study domain naturally consists of heterogeneous elements with complex relational structure, where the data is naturally represented in multiple related tables. In this paper, we present a formal framework for causal inference from such relational data. We propose a declarative language called CaRL for capturing causal background knowledge and assumptions, and specifying causal queries using simple Datalog-like rules. CaRL provides a foundation for inferring causality and reasoning about the effect of complex interventions in relational domains. We present an extensive experimental evaluation on real relational data to illustrate the applicability of CaRL in social sciences and healthcare.}, author = {Babak Salimi and Harsh Parikh and Moe Kayali and Sudeepa Roy and Lise Getoor and Dan Suciu} } @conference {360, title = {Contrastive Entity Linkage: Mining Variational Attributes from Large Catalogs for Entity Linkage}, booktitle = {Automated Knowledge Base Construction (AKBC)}, year = {2020}, abstract = {Presence of near identical, but distinct, entities called entity variations makes the task of data integration challenging. For example, in the domain of grocery products, variations share the same value for attributes such as brand, manufacturer and product line, but differ in other attributes, called variational attributes, such as package size and color. Identifying variations across data sources is an important task in itself and is crucial for identifying duplicates. 
However, this task is challenging as the variational attributes are often present as a part of unstructured text and are domain dependent. In this work, we propose our approach, Contrastive entity linkage, to identify both entity pairs that are the same and pairs that are variations of each other. We propose a novel unsupervised approach, VarSpot, to mine domain-dependent variational attributes present in unstructured text. The proposed approach reasons about both similarities and differences between entities and can easily scale to large sources containing millions of entities. We show the generality of our approach by performing experimental evaluation on three different domains. Our approach significantly outperforms state-of-the-art learning-based and rule-based entity linkage systems by up to 4\% F1 score when identifying duplicates, and up to 41\% when identifying entity variations.}, author = {Varun Embar and Bunyamin Sisman and Hao Wei and Xin Luna Dong and Christos Faloutsos and Lise Getoor} } @conference {357, title = {Estimating Aggregate Properties In Relational Networks With Unobserved Data}, booktitle = {AAAI Workshop on Statistical Relational Artificial Intelligence (StarAI)}, year = {2020}, abstract = {Aggregate network properties such as cluster cohesion and the number of bridge nodes can be used to glean insights about a network{\textquoteright}s community structure, spread of influence and the resilience of the network to faults. Efficiently computing network properties when the network is fully observed has received significant attention (Wasserman and Faust 1994; Cook and Holder 2006); however, the problem of computing aggregate network properties when there are missing data attributes has received little attention. Computing these properties for networks with missing attributes involves performing inference over the network. 
Statistical relational learning (SRL) and graph neural networks (GNNs) are two classes of machine learning approaches well suited for inferring missing attributes in a graph. In this paper, we study the effectiveness of these approaches in estimating aggregate properties on networks with missing attributes. We compare two SRL approaches and three GNNs. For these approaches we estimate these properties using point estimates such as MAP and mean. For SRL-based approaches that can infer a joint distribution over the missing attributes, we also estimate these properties as an expectation over the distribution. To compute the expectation tractably for probabilistic soft logic, one of the SRL approaches that we study, we introduce a novel sampling framework. In the experimental evaluation, using three benchmark datasets, we show that SRL-based approaches tend to outperform GNN-based approaches both in computing aggregate properties and predictive accuracy. Specifically, we show that estimating the aggregate properties as an expectation over the joint distribution outperforms point estimates. }, author = {Varun Embar and Sriram Srinivasan and Lise Getoor} } @conference {356, title = {Tandem Inference: An Out-of-Core Streaming Algorithm For Very Large-Scale Relational Inference}, booktitle = {AAAI Conference on Artificial Intelligence (AAAI)}, year = {2020}, abstract = {Statistical relational learning (SRL) frameworks allow users to create large, complex graphical models using a compact, rule-based representation. However, these models can quickly become prohibitively large and not fit into machine memory. In this work we address this issue by introducing a novel technique called tandem inference (TI). The primary idea of TI is to combine grounding and inference such that both processes happen in tandem. TI uses an out-of-core streaming approach to overcome memory limitations. 
Even when memory is not an issue, we show that our proposed approach is able to do inference faster while using less memory than existing approaches. To show the effectiveness of TI, we use a popular SRL framework called Probabilistic Soft Logic (PSL). We implement TI for PSL by proposing a gradient-based inference engine and a streaming approach to grounding. We show that we are able to run an SRL model with over 1B cliques in under nine hours and using only 10 GB of RAM; previous approaches required more than 800 GB for this model and are infeasible on common hardware. To the best of our knowledge, this is the largest SRL model ever run. }, author = {Sriram Srinivasan and Eriq Augustine and Lise Getoor} } @article {353, title = {A Collective, Probabilistic Approach to Schema Mapping Using Diverse Noisy Evidence}, journal = {IEEE Transactions on Knowledge and Data Engineering (TKDE)}, volume = {31}, year = {2019}, pages = {1426--1439}, abstract = {We propose a probabilistic approach to the problem of schema mapping. Our approach is declarative, scalable, and extensible. It builds upon recent results in both schema mapping and probabilistic reasoning and contributes novel techniques in both fields. We introduce the problem of schema mapping selection, that is, choosing the best mapping from a space of potential mappings, given both metadata constraints and a data example. As selection has to reason holistically about the inputs and the dependencies between the chosen mappings, we define a new schema mapping optimization problem which captures interactions between mappings as well as inconsistencies and incompleteness in the input. We then introduce Collective Mapping Discovery (CMD), our solution to this problem using state-of-the-art probabilistic reasoning techniques. 
Our evaluation on a wide range of integration scenarios, including several real-world domains, demonstrates that CMD effectively combines data and metadata information to infer highly accurate mappings even with significant levels of noise.}, keywords = {Cognition, Complexity theory, Data engineering, Knowledge engineering, Metadata, Probabilistic logic, Schema mapping, Task analysis, collective mapping discovery, data integration, inference mechanisms, meta data, optimisation, optimization, potential mappings, probabilistic reasoning techniques, probability, schema mapping optimization problem, uncertainty handling}, doi = {10.1109/TKDE.2018.2865785}, author = {Angelika Kimmig and Alex Memory and Renee J Miller and Lise Getoor} } @article {354, title = {A Declarative Approach to Fairness in Relational Domains}, journal = {IEEE Data Engineering Bulletin}, volume = {42}, year = {2019}, pages = {36--48}, abstract = {AI and machine learning tools are being used with increasing frequency for decision making in domains that affect people{\textquoteright}s lives such as employment, education, policing and financial qualifications. These uses raise concerns about biases of algorithmic discrimination and have motivated the development of fairness-aware machine learning. However, existing fairness approaches are based solely on attributes of individuals. In many cases, discrimination is much more complex, and taking into account the social, organizational, and other connections between individuals is important. We introduce new notions of fairness that are able to capture the relational structure in a domain. We use first-order logic to provide a flexible and expressive language for specifying complex relational patterns of discrimination. Furthermore, we extend an existing statistical relational learning framework, probabilistic soft logic (PSL), to incorporate our definition of relational fairness. We refer to this fairness-aware framework as FairPSL. 
FairPSL makes use of the logical definitions of fairness but also supports a probabilistic interpretation. In particular, we show how to perform maximum a posteriori (MAP) inference by exploiting probabilistic dependencies within the domain while avoiding violations of fairness guarantees. Preliminary empirical evaluation shows that we are able to make both accurate and fair decisions.}, author = {Golnoosh Farnadi and Behrouz Babaki and Lise Getoor} } @conference {349, title = {Collective Alignment of Large-scale Ontologies}, booktitle = {AKBC Workshop on Federated Knowledge Bases (FKBs)}, year = {2019}, abstract = {The rapid growth in digitization of data has led to the creation of fragmented but vital knowledge sources. Ontologies are one such crucial source of knowledge and aligning them is a key challenge for creating an Open Knowledge Network. The task of ontology alignment has received significant attention. In this abstract, we build on existing work and propose a novel probabilistic ontology alignment approach that combines several similarity measures with structural information such as subsumption and mutual exclusion. Most large-scale ontologies such as product catalogs [Agrawal et al. 2001] and folksonomies [Plangprasopchok et al. 2010] do not have a formally defined ontology with well-defined classes, instances and properties. Instead, they loosely define relationships such as subsumption between various entities. For example, a folksonomy for Instagram would contain not only tags corresponding to people, places and activities but also tags such as Selfie, which correspond to a type of image. Product catalogs have very different textual representations for the same entity. For instance, products related to 3D printing are present in a category called 3D Printing \& Supplies on eBay, while the same products are present in a category called Additive Manufacturing Products on Amazon. 
Moreover, the same textual representation might have different semantics based on the source of the ontology. The category Headphones in an ontology corresponding to a particular company is different from the Headphones category of a large e-commerce retailer such as Amazon. Even aligning tracks in a music catalog is considerably challenging as it is unclear whether the tracks Bohemian Rhapsody OST and Bohemian Rhapsody Remastered 2011 are the same. To sum up, ontology alignment is challenging due to informally defined subsumptions, multiple textual representations for the same class, ambiguity of similar textual representations and presence of a large number of instance variations. Existing ontology alignment approaches can be classified into schema-based approaches, instance-based approaches and hybrid approaches [Euzenat et al. 2007]. Hybrid approaches such as InformationFlow-based Map [Kalfoglou et al., 2003] combine string-based heuristics and the structure of the ontology to generate alignments. Naive Ontology Mapping [Ehrig et al., 2004] makes use of rules that exploit information present in the ontology. Motivated by these hybrid methods, our proposed ontology alignment approach combines several similarity and distance scores with soft structural constraints. We then define a probability distribution over the set of all possible alignments that takes into account correlations between different alignments. Apart from similarity scores computed on the textual representation of entities, we also compute scores using the entity hierarchy described by the subsumption relations. This helps in identifying the semantics of each entity. Apart from structural constraints such as mutual exclusion, we also incorporate relation specific constraints. For instance, it is unlikely that multiple entities that have a parent-child relationship align to a single entity. We use Probabilistic Soft Logic (PSL) [Bach et al. 
2017], a powerful probabilistic programming framework, that uses weighted first-order logic rules to define a probability distribution. Having defined the distribution, we use the efficient MAP inference supported by PSL to identify the most likely alignment. We performed experiments on product taxonomies extracted from four websites and compared our method to a tf-idf similarity score based approach. While the instance-based similarity score prevented aligning categories such as bicycle stands \& storage and storage \& home organization, the structural constraints helped distinguish between equivalence and more general relations. For example, beauty \& personal care was aligned to beauty and not hair care, even though there is a significant overlap of products, as hair care was the child of beauty in the product taxonomy. In summary, combining multiple scores and structural constraints using a probabilistic framework led to a 36\% improvement in precision and a 15\% improvement in F1 score over the string similarity baseline.}, author = {Varun Embar and Jay Pujara and Lise Getoor} } @conference {351, title = {Estimating Causal Effects of Tone in Online Debates}, booktitle = {International Joint Conference on Artificial Intelligence (IJCAI)}, year = {2019}, abstract = {Statistical methods applied to social media posts shed light on the dynamics of online dialogue. For example, users{\textquoteright} wording choices predict their persuasiveness and users adopt the language patterns of other dialogue participants. In this paper, we estimate the causal effect of reply tones in debates on linguistic and sentiment changes in subsequent responses. The challenge for this estimation is that a reply{\textquoteright}s tone and subsequent responses are confounded by the users{\textquoteright} ideologies on the debate topic and their emotions. 
To overcome this challenge, we learn representations of ideology using generative models of text. We study debates from 4Forums.com and compare annotated tones of replying such as emotional versus factual, or reasonable versus attacking. We show that our latent confounder representation reduces bias in ATE estimation. Our results suggest that factual and asserting tones affect dialogue and provide a methodology for estimating causal effects from text. }, author = {Dhanya Sridhar and Lise Getoor} } @article {363, title = {Generating and Understanding Personalized Explanations in Hybrid Recommender Systems}, journal = {ACM Transactions on Interactive Intelligent Systems}, year = {2019}, abstract = {Recommender systems are ubiquitous, and shape the way users access information and make decisions. As these systems become more complex, there is a growing need for transparency and interpretability. In this paper, we study the problem of generating and visualizing personalized explanations for recommender systems which incorporate signals from many different data sources. We use a flexible, extendable probabilistic programming approach, and show how we can generate real-time personalized recommendations. We then turn these personalized recommendations into explanations. We perform an extensive user study to evaluate the benefits of explanations for hybrid recommender systems. We conduct a crowd-sourced user study where our system generates personalized recommendations and explanations for real users of the last.fm music platform. First, we evaluate the performance of the recommendations in terms of perceived accuracy and novelty. Next, we experiment with 1) different explanation styles (e.g., user-based, item-based), 2) manipulating the number of explanation styles presented, and 3) manipulating the presentation format (e.g., textual vs. visual). 
We also apply a mixed-model statistical analysis to consider user personality traits as a control variable and demonstrate the usefulness of our approach in creating personalized hybrid explanations with different style, number, and format. Finally, we perform a post analysis which shows different preferences for explanation styles between experienced and novice last.fm users.}, author = {Pigi Kouki and James Schaffer and Jay Pujara and John O{\textquoteright}Donovan and Lise Getoor} } @conference {352, title = {Identifying Facet Mismatches In Search Via Micrographs}, booktitle = {International Conference on Information and Knowledge Management (CIKM)}, year = {2019}, abstract = {E-commerce search engines are the primary means by which customers shop for products online. Each customer query contains multiple facets such as product type, color, brand, etc. A successful search engine retrieves products that are relevant to the query along each of these attributes. However, due to lexical (erroneous title, description, etc.) and behavioral irregularities (clicks or purchases of products that do not belong to the same facet as the query), some mismatched products are shown in the search results. These irregularities are often detected using simple binary classifiers like gradient boosted decision trees or logistic regression. Typically, these binary classifiers use strong independence assumptions between the samples and ignore structural relationships available in the data, such as the connections between products and queries. In this paper, we use the connections that exist between products and query to identify a special kind of structure we refer to as a micrograph. Further, we make use of Statistical Relational Learning (SRL) to incorporate these micrographs in the data and pose the problem as a structured prediction problem. We refer to this approach as structured mismatch classification (smc). 
In addition, we show that naive addition of structure does not improve the performance of the model and hence introduce a variation of smc, strong smc (s2mc), which improves over the baseline by passing information from high-confidence predictions to lower confidence predictions. In our empirical evaluation we show that our proposed approach outperforms the baseline classification methods by up to 12\% in precision. Furthermore, we use quasi-Newton methods to make our method viable for real-time inference in a search engine and show that our approach is up to 150 times faster than existing ADMM-based solvers.}, keywords = {collective classification, defect, probabilistic soft logic, search, statistical relational language, structured prediction}, author = {Sriram Srinivasan and Nikhil S Rao and Karthik Subbian and Lise Getoor} } @article {343, title = {Interpretable Engagement Models for MOOCs using Hinge-loss Markov Random Fields}, journal = {IEEE Transactions on Learning Technologies (TLT)}, volume = {14}, year = {2019}, pages = {1--1}, chapter = {1}, abstract = {Maintaining and cultivating student engagement is critical for learning. Understanding factors affecting student engagement can help in designing better courses and improving student retention. The large number of participants in massive open online courses (MOOCs) and data collected from their interactions on the MOOC open up avenues for studying student engagement at scale. In this work, we develop an interpretable statistical relational learning model for understanding student engagement in online courses using a complex combination of behavioral, linguistic, structural, and temporal cues. We show how to abstract student engagement types of active, passive, and disengagement as meaningful latent variables using logical rules in our model connecting student behavioral signals with student success in MOOCs. 
We demonstrate that the latent formulation for engagement helps in predicting two measures of student success: performance, their final grade in the course, and survival, their continued presence in the course till the end, across seven MOOCs. Further, in order to initiate better instructor interventions, we need to be able to predict student success early in the course. We demonstrate that we can predict student success early in the course reliably using the latent model. We also demonstrate the utility of our models in predicting student success in new courses, by training our models on one course and testing on another course. We show that the latent abstractions are helpful in predicting student success and engagement reliably in new MOOCs that haven{\textquoteright}t yet gathered student interaction data. We then perform a closer quantitative analysis of different features derived from student interactions on the MOOC and identify student activities that are good indicators of student success at different points in the course. Through a qualitative analysis of the latent engagement variable values, we demonstrate their utility in understanding students{\textquoteright} engagement levels at various points in the course and movement of students across different types of engagement.}, author = {Arti Ramesh and Dan Goldwasser and Bert Huang and Hal Daume III and Lise Getoor} } @conference {341, title = {Lifted Hinge-Loss Markov Random Fields}, booktitle = {AAAI Conference on Artificial Intelligence (AAAI)}, year = {2019}, month = {11/2018}, abstract = {Statistical relational learning models are powerful tools that combine ideas from first-order logic with probabilistic graphical models to represent complex dependencies. Despite their success in encoding large problems with a compact set of weighted rules, performing inference over these models is often challenging. 
In this paper, we show how to effectively combine two powerful ideas for scaling inference for large graphical models. The first idea, lifted inference, is a well-studied approach to speeding up inference in graphical models by exploiting symmetries in the underlying problem. The second idea is to frame maximum a posteriori (MAP) inference as a convex optimization problem and use alternating direction method of multipliers (ADMM) to solve the problem in parallel. A well-studied relaxation to the combinatorial optimization problem defined for logical Markov random fields gives rise to a hinge-loss Markov random field (HL-MRF) for which MAP inference is a convex optimization problem. We show how the formalism introduced for coloring weighted bipartite graphs using a color refinement algorithm can be integrated with the ADMM optimization technique to take advantage of the sparse dependency structures of HL-MRFs. Our proposed approach, lifted hinge-loss Markov random fields (LHL-MRFs), preserves the structure of the original problem after lifting and solves lifted inference as distributed convex optimization with ADMM. In our empirical evaluation on real-world problems, we observe up to a three times speed up in inference over HL-MRFs.}, author = {Sriram Srinivasan and Behrouz Babaki and Golnoosh Farnadi and Lise Getoor} } @conference {342, title = {Personalized Explanations for Hybrid Recommender Systems}, booktitle = {Intelligent User Interfaces (IUI)}, year = {2019}, abstract = {Hybrid recommender systems, which combine the strength of several information sources to provide recommendations, have emerged as a means to improve the quality of recommendations. Although such systems are highly effective, they are inherently complex. As a result, providing users with a visually-appealing and useful explanation for each recommendation poses a significant challenge. 
In this paper, we study the problems of generating and visualizing personalized explanations from hybrid recommender systems. We build upon a hybrid probabilistic graphical model and develop an approach to generate real-time recommendations along with personalized explanations. To study the benefits of explanations for hybrid recommender systems, we conduct a crowd-sourced user study where our system generates personalized recommendations and explanations for real users of the last.fm music platform. We also experiment with different presentation formats, such as textual or graphical. We experiment with 1) different explanation styles (e.g., user-based, item-based), 2) varying the volume (i.e., number) of the explanation styles, and 3) a variety of presentation formats (such as textual or visual). We apply a mixed model statistical analysis to consider the user personality traits as a control variable, and demonstrate the usefulness of our approach in creating personalized hybrid explanations with different style, volume, and format.}, author = {Pigi Kouki and James Schaffer and Jay Pujara and John O{\textquoteright}Donovan and Lise Getoor} } @conference {350, title = {Tractable Marginal Inference for Hinge-Loss Markov Random Fields}, booktitle = {ICML Workshop on Tractable Probabilistic Modeling (TPM)}, year = {2019}, month = {06/2019}, abstract = {Hinge-loss Markov random fields (HL-MRFs) are a class of undirected graphical models that has been successfully applied to model richly structured data. HL-MRFs are defined over a set of continuous random variables in the range [0,1], which makes computing the MAP convex. However, computation of marginal distributions remains intractable. In this paper, we introduce a novel sampling-based algorithm to compute marginal distributions. We define the notion of association blocks, which help identify islands of high probability, and propose a novel approach to sample from these regions. 
We validate our approach by estimating both average precision and various properties of a social network. We show that the proposed approach outperforms MAP estimates in both average precision and the accuracy of the properties by 20\% and 40\% respectively on the large social network.}, author = {Varun Embar and Sriram Srinivasan and Lise Getoor} } @conference {348, title = {Tractable Probabilistic Reasoning Through Effective Grounding}, booktitle = {ICML Workshop on Tractable Probabilistic Modeling (TPM)}, year = {2019}, abstract = {Templated Statistical Relational Learning languages, such as Markov Logic Networks (MLNs) and Probabilistic Soft Logic (PSL), offer much of the expressivity of probabilistic graphical models in a compact form that is intuitive to both experienced modelers and domain experts. However, these languages have historically suffered from tractability issues stemming from the large size of the instantiated models and the complex joint inference performed over these models. Although much research has gone into improving the tractability of these languages using approximate or lifted inference, a relatively small amount of research has gone into improving tractability through efficient instantiation of these large models. In this position paper, we will draw attention to open research areas around efficiently instantiating templated probabilistic models.}, author = {Eriq Augustine and Theodoros Rekatsinas and Lise Getoor} } @article {358, title = {Understanding Hybrid-MOOC Effectiveness with a Collective Socio-Behavioral Model}, journal = {Journal of Educational Data Mining (JEDM)}, volume = {11}, year = {2019}, pages = {42--77}, abstract = {Online courses for high school students promise the opportunity to bring critical education to youth most at need, bridging gaps which may exist in brick-and-mortar institutions. 
In this work, we investigate a hybrid Massive Open Online Course for high schoolers which includes an in-person coaching component. We address the efficacy of these courses and the contribution of in-person coaching. We first analyze features of student behavior and their effect on post-test performance and then propose a novel probabilistic model for inferring student success on an AP exam post-test. Our proposed model exploits relationships between students to collectively infer student success. When these relationships are not directly observed, we formulate latent constructs to capture social dynamics of learning. By collectively inferring student success as a function of both unobserved individual characteristics and relational dynamics, we improve predictive performance by up to 6.8\% over an SVM model with only observable features. We propose this general socio-behavioral modeling framework as a flexible approach for including unobserved aspects of learning in meaningful ways, in order to better understand and infer student success.}, doi = {10.5281/zenodo.3594773}, url = {https://doi.org/10.5281/zenodo.3594773}, author = {Sabina Tomkins and Lise Getoor} } @conference {326, title = {A Comparison of Bottom-Up Approaches to Grounding for Templated Markov Random Fields}, booktitle = {Machine Learning and Systems (MLSys)}, year = {2018}, url = {https://github.com/eriq-augustine/grounding-experiments}, author = {Eriq Augustine and Lise Getoor} } @conference {337, title = {A Fairness-aware Hybrid Recommender System}, booktitle = {RecSys Workshop on Responsible Recommendation (FATREC)}, year = {2018}, abstract = {

Recommender systems are used in a variety of domains affecting people{\textquoteright}s lives. This has raised concerns about possible biases and discrimination that such systems might exacerbate. There are two primary kinds of biases inherent in recommender systems: observation bias and bias stemming from imbalanced data. Observation bias exists due to a feedback loop which causes the model to learn to only predict recommendations similar to previous ones. Imbalance in data occurs when systematic societal, historical, or other ambient bias is present in the data. In this paper, we address both biases by proposing a hybrid fairness-aware recommender system. Our model provides efficient and accurate recommendations by incorporating multiple user-user and item-item similarity measures, content, and demographic information, while addressing recommendation biases. We implement our model using a powerful and expressive probabilistic programming language called probabilistic soft logic. We experimentally evaluate our approach on a popular movie recommendation dataset, showing that our proposed model can provide more accurate and fairer recommendations, compared to a state-of-the-art fair recommender system.

}, author = {Golnoosh Farnadi and Kouki, Pigi and Spencer K. Thompson and Sriram Srinivasan and Lise Getoor} } @conference {333, title = {A Socio-linguistic Model for Cyberbullying Detection}, booktitle = {International Conference on Advances in Social Networks Analysis and Mining (ASONAM)}, year = {2018}, abstract = {

Cyberbullying is a serious threat to both the short and long-term well-being of social media users. Addressing this problem in online environments demands the ability to automatically detect cyberbullying and to identify the roles that participants assume in social interactions. As cyberbullying occurs within online communities, it is also vital to understand the group dynamics that support bullying behavior. To this end, we propose a socio-linguistic model which jointly detects cyberbullying content in messages, discovers latent text categories, identifies participant roles and exploits social interactions. While our method makes use of content that is labeled as bullying, it does not require category, role or relationship labels. Furthermore, as bullying labels are often subjective, noisy and inconsistent, an important contribution of our paper is effective methods for leveraging inconsistent labels. Rather than discard inconsistent labels, we evaluate different methods for learning from them, demonstrating that incorporating uncertainty allows for better generalization. Our proposed socio-linguistic model achieves an 18\% improvement over state-of-the-art methods.

}, author = {Tomkins, Sabina and Lise Getoor and Chen, Yunfei and Zhang, Yi} } @conference {323, title = {A Structured Approach to Understanding Recovery and Relapse in AA}, booktitle = {The Web Conference (WWW)}, year = {2018}, abstract = {

Alcoholism, also known as Alcohol Use Disorder (AUD), is a serious problem affecting millions of people worldwide. Recovery from AUD is known to be challenging and often leads to relapse at various points after enrolling in a rehabilitation program such as Alcoholics Anonymous (AA). In this work, we take a structured approach to understand recovery and relapse from AUD using social media data. To do so, we combine linguistic and psychological attributes of users with relational features that capture useful structure in the user interaction network. We evaluate our models on AA-attending users extracted from the Twitter social network and predict recovery at two different points{\textemdash}90 days and 1 year after the user joins AA, respectively. Our experiments reveal that our structured approach is helpful in predicting recovery in these users. We perform extensive quantitative analysis of different groups of features and dependencies among them. Our analysis sheds light on the role of each feature group and how they combine to predict recovery and relapse. Finally, we present a qualitative analysis of different reasons behind users relapsing to AUD. Our models and analysis are helpful in making meaningful predictions in scenarios where only a subset of features are available and can potentially be helpful in identifying and preventing relapse early.

}, url = {https://github.com/yzhan202/zhang-www18-experiments}, author = {Zhang, Yue and Ramesh, Arti and Golbeck, Jennifer and Dhanya Sridhar and Lise Getoor} } @conference {321, title = {Aligning Product Categories using Anchor Products}, booktitle = {Workshop on Knowledge Base Construction, Reasoning and Mining (KBCOM)}, year = {2018}, abstract = {

E-commerce sites group similar products into categories, and these categories are further organized in a taxonomy. Since different sites have different products and cater to a variety of shoppers, the taxonomies differ both in the categorization of products and the textual representation used for these categories. In this paper, we propose a technique to align categories across sites, which is useful information to have in product graphs. We use breadcrumbs present on the product pages to infer a site{\textquoteright}s taxonomy. We generate a list of candidate category pairs for alignment using anchor products{\textemdash}products present in two or more sites. We use multiple similarity and distance metrics to compare these candidates. To generate the final set of alignments, we propose a model that combines these metrics with a set of structural constraints. The model is based on probabilistic soft logic (PSL), a scalable probabilistic programming framework. We run experiments on data extracted from Amazon, Ebay, Staples and Target, and show that the distance metric based on products, and the use of PSL to combine various metrics and structural constraints lead to improved alignments.

}, author = {Varun Embar and Golnoosh Farnadi and Jay Pujara and Lise Getoor} } @conference {329, title = {Clustering System Data using Aggregate Measures}, booktitle = {Machine Learning and Systems (MLSys)}, year = {2018}, abstract = {

Many real-world systems generate a tremendous amount of data cataloging the actions, responses, and internal states. Prominent examples include user logs on web servers, instrumentation of source code, and performance statistics in large data centers. The magnitude of this data makes it impossible to log individual events, but instead requires capturing aggregate statistics at a coarser granularity, resulting in statistical distributions instead of discrete values. We survey several popular statistical distance measures and demonstrate how appropriate statistical distances can allow meaningful clustering of web log data.

}, author = {Johnnie Chang and Robert Chen and Jay Pujara and Lise Getoor} } @article {331, title = {Collective Entity Resolution in Multi-Relational Familial Networks}, journal = {Knowledge and Information Systems (KAIS)}, volume = {61}, year = {2018}, pages = {1547--1581}, abstract = {

Entity resolution in settings with rich relational structure often introduces complex dependencies between co-references. Exploiting these dependencies is challenging -- it requires seamlessly combining statistical, relational, and logical dependencies. One task of particular interest is entity resolution in familial networks. In this setting, multiple partial representations of a family tree are provided, from the perspective of different family members, and the challenge is to reconstruct a family tree from these multiple, noisy, partial views. This reconstruction is crucial for applications such as understanding genetic inheritance, tracking disease contagion, and performing census surveys. Here, we design a model that incorporates statistical signals (such as name similarity), relational information (such as sibling overlap), logical constraints (such as transitivity and bijective matching), and predictions from other algorithms (such as logistic regression and support vector machines), in a collective model. We show how to integrate these features using probabilistic soft logic, a scalable probabilistic programming framework. In experiments on real-world data, our model significantly outperforms state-of-the-art classifiers that use relational features but are incapable of collective reasoning.

}, author = {Pigi Kouki and Jay Pujara and Christopher Marcum and Laura Koehly and Lise Getoor} } @conference {330, title = {Estimating Causal Effects of Exercise from Mood Logging Data}, booktitle = {ICML Workshop on Causal Machine Learning (CausalML)}, year = {2018}, abstract = {

Mood and activity logging applications empower users to monitor their daily well-being and make informed health choices. To provide users with useful feedback that can improve quality of life, a critical task is understanding the causal effects of daily activities on mood and other wellness markers. In this work, we analyze observational data from EmotiCal, a recently developed mood-logging web application, to explore the effects of exercise on mood. We investigate several methodological choices for estimating the conditional average treatment effect, and highlight a novel use of textual data to improve the significance of our results.

}, author = {Dhanya Sridhar and Aaron Springer and Victoria Hollis and Steve Whittaker and Lise Getoor} } @conference {322, title = {Fairness in Relational Domains}, booktitle = {Artificial Intelligence, Ethics, and Society (AIES)}, year = {2018}, abstract = {

AI and machine learning tools are being used with increasing frequency for decision making in domains that affect people{\textquoteright}s lives such as employment, education, policing and loan approval. These uses raise concerns about biases of algorithmic discrimination and have motivated the development of fairness-aware machine learning. However, existing fairness approaches are based solely on attributes of individuals. In many cases, discrimination is much more complex, and taking into account the social, organizational, and other connections between individuals is important. We introduce new notions of fairness that are able to capture the relational structure in a domain. We use first-order logic to provide a flexible and expressive language for specifying complex relational patterns of discrimination. Furthermore, we extend an existing statistical relational learning framework, probabilistic soft logic (PSL), to incorporate our definition of relational fairness. We refer to this fairness-aware framework as FairPSL. FairPSL makes use of the logical definitions of fairness but also supports a probabilistic interpretation. In particular, we show how to perform maximum a posteriori (MAP) inference by exploiting probabilistic dependencies within the domain while avoiding violation of fairness guarantees. Preliminary empirical evaluation shows that we are able to make both accurate and fair decisions.

}, author = {Golnoosh Farnadi and Behrouz Babaki and Lise Getoor} } @conference {318, title = {Fairness-aware Relational Learning and Inference}, booktitle = {AAAI Workshop on Declarative Learning Based Programming (DeLBP)}, year = {2018}, author = {Golnoosh Farnadi and Behrouz Babaki and Lise Getoor} } @conference {328, title = {Scalable Probabilistic Causal Structure Discovery}, booktitle = {International Joint Conference on Artificial Intelligence (IJCAI)}, year = {2018}, abstract = {

Complex causal networks underlie many real-world problems, from the regulatory interactions between genes to the environmental patterns used to understand climate change. Computational methods seek to infer these causal networks using observational data and domain knowledge. In this paper, we identify three key requirements for inferring the structure of causal networks for scientific discovery: (1) robustness to noise in observed measurements; (2) scalability to handle hundreds of variables; and (3) flexibility to encode domain knowledge and other structural constraints. We first formalize the problem of joint probabilistic causal structure discovery. We develop an approach using probabilistic soft logic (PSL) that exploits multiple statistical tests, supports efficient optimization over hundreds of variables, and can easily incorporate structural constraints, including imperfect domain knowledge. We compare our method against multiple well-studied approaches on biological and synthetic datasets, showing improvements of up to 20\% in F1-score over the best performing baseline in realistic settings.

}, url = {https://bitbucket.org/linqs/causpsl/src/master/}, author = {Dhanya Sridhar and Pujara, Jay and Lise Getoor} } @conference {332, title = {Scalable Structure Learning for Probabilistic Soft Logic}, booktitle = {IJCAI Workshop on Statistical Relational AI (StarAI)}, year = {2018}, month = {06/2018}, abstract = {

Statistical relational frameworks such as Markov logic networks and probabilistic soft logic (PSL) encode model structure with weighted first-order logical clauses. Learning these clauses from data is referred to as structure learning. Structure learning alleviates the manual cost of specifying models. However, this benefit comes with high computational costs; structure learning typically requires an expensive search over the space of clauses which involves repeated optimization of clause weights. In this paper, we propose the first two approaches to structure learning for PSL. We introduce a greedy search-based algorithm and a novel optimization method that trade-off scalability and approximations to the structure learning problem in varying ways. The highly scalable optimization method combines data-driven generation of clauses with a piecewise pseudolikelihood (PPLL) objective that learns model structure by optimizing clause weights only once. We compare both methods across five real-world tasks, showing that PPLL achieves an order of magnitude runtime speedup and AUC gains up to 15\% over greedy search.

}, author = {Varun Embar and Dhanya Sridhar and Golnoosh Farnadi and Lise Getoor} } @conference {335, title = {Sustainability at Scale: Bridging the Intention-Behavior Gap with Sustainable Recommendations}, booktitle = {Recommender Systems (RecSys)}, year = {2018}, abstract = {

Finding sustainable products and evaluating their claims is a significant barrier facing sustainability-minded customers. Tools that reduce both these burdens are likely to boost the sale of sustainable products. However, it is difficult to determine the sustainability characteristics of these products {\textemdash} there are a variety of certifications and definitions of sustainability, and quality labeling requires input from domain experts. In this paper, we propose a flexible probabilistic framework that uses domain knowledge to identify sustainable products and customers, and uses these labels to predict customer purchases. We evaluate our approach on grocery items from the Amazon catalog. Our proposed approach outperforms established recommender system models in predicting future purchases while jointly inferring sustainability scores for customers and products.

}, author = {Tomkins, Sabina and Isley, Steve and London, Ben and Lise Getoor} } @conference {334, title = {The Impact of Environmental Stressors on Human Trafficking}, booktitle = {ICWSM Workshop on Beyond Online Data (BOD)}, year = {2018}, abstract = {

Severe environmental events have extreme effects on all segments of society, including criminal activity. Extreme weather events, such as tropical storms, fires, and floods create instability in communities, and can be exploited by criminal organizations. Here we investigate the potential impact of catastrophic storms on the criminal activity of human trafficking. We propose three theories of how these catastrophic storms might impact trafficking and provide evidence for each. Researching human trafficking is made difficult by its illicit nature and the obscurity of high-quality data. Here, we analyze online advertisements for services which can be collected at scale and provide insights into traffickers{\textquoteright} behavior. To successfully combine relevant heterogenous sources of information, as well as spatial and temporal structure, we propose a collective, probabilistic approach. We implement this approach with Probabilistic Soft Logic, a probabilistic programming framework which can flexibly model relational structure and for which inference of future locations is highly efficient. Furthermore, this framework can be used to model hidden structure, such as latent links between locations. Our proposed approach can model and predict how traffickers move. In addition, we propose a model which learns connections between locations. This model is then adapted to have knowledge of environmental events, and we demonstrate that incorporating knowledge of environmental events can improve prediction of future locations. While we have validated our models on the impact of severe weather on human trafficking, we believe our models can be generalized to a variety of other settings in which environmental events impact human behavior.

}, author = {Tomkins, Sabina and Golnoosh Farnadi and Brian Amantullah and Lise Getoor and Steven Minton} } @conference {338, title = {The Impact of Environmental Stressors on Human Trafficking}, booktitle = {International Conference on Data Mining (ICDM)}, year = {2018}, abstract = {

Severe environmental events have extreme effects on all segments of society, including criminal activity. Extreme weather events, such as tropical storms, fires, and floods create instability in communities, and can be exploited by criminal organizations. Here we investigate the potential impact of catastrophic storms on the criminal activity of human trafficking. We propose three theories of how these catastrophic storms might impact trafficking and provide evidence for each. Researching human trafficking is made difficult by its illicit nature and the obscurity of high-quality data. Here, we analyze online advertisements for services which can be collected at scale and provide insights into traffickers{\textquoteright} behavior. To successfully combine relevant heterogenous sources of information, as well as spatial and temporal structure, we propose a collective, probabilistic approach. We implement this approach with Probabilistic Soft Logic, a probabilistic programming framework which can flexibly model relational structure and for which inference of future locations is highly efficient. Furthermore, this framework can be used to model hidden structure, such as latent links between locations. Our proposed approach can model and predict how traffickers move. In addition, we propose a model which learns connections between locations. This model is then adapted to have knowledge of environmental events, and we demonstrate that incorporating knowledge of environmental events can improve prediction of future locations. While we have validated our models on the impact of severe weather on human trafficking, we believe our models can be generalized to a variety of other settings in which environmental events impact human behavior.

}, author = {Tomkins, Sabina and Golnoosh Farnadi and Brian Amantullah and Lise Getoor and Steven Minton} } @conference {339, title = {Understanding Evolution of Long-running MOOCs}, booktitle = {International Conference on Web Information Systems Engineering (WISE)}, year = {2018}, author = {Arti Ramesh and Lise Getoor} } @conference {kouki:icdm17, title = {Collective Entity Resolution in Familial Networks}, booktitle = {IEEE International Conference on Data Mining (ICDM)}, year = {2017}, note = {To Appear}, abstract = {

Entity resolution in settings with rich relational structure often introduces complex dependencies between coreferences. Exploiting these dependencies is challenging {\textendash} it requires seamlessly combining statistical, relational, and logical dependencies. One task of particular interest is entity resolution in familial networks. In this setting, multiple partial representations of a family tree are provided, from the perspective of different family members, and the challenge is to reconstruct a family tree from these multiple, noisy, partial views. This reconstruction is crucial for applications such as understanding genetic inheritance, tracking disease contagion, and performing census surveys. Here, we design a model that incorporates statistical signals, such as name similarity, relational information, such as sibling overlap, and logical constraints, such as transitivity and bijective matching, in a collective model. We show how to integrate these features using probabilistic soft logic, a scalable probabilistic programming framework. In experiments on real-world data, our model significantly outperforms state-of-the-art classifiers that use relational features but are incapable of collective reasoning.

}, url = {https://github.com/pkouki/icdm2017}, author = {Kouki, Pigi and Pujara, Jay and Marcum, Christopher and Koehly, Laura and Lise Getoor} } @conference {kimmig:icde17, title = {A Collective, Probabilistic Approach to Schema Mapping}, booktitle = {International Conference on Data Engineering (ICDE)}, year = {2017}, url = {https://github.com/alexmemory/kimmig-icde17/wiki}, author = {Kimmig, Angelika and Memory, Alex and Miller, Renee and Lise Getoor} } @conference {tomkins:lld2017, title = {Detecting Cyber-bullying from Sparse Data and Inconsistent Labels}, booktitle = {Learning with Limited Labeled Data (LLD) NIPS Workshop}, year = {2017}, author = {Tomkins, Sabina and Lise Getoor and Chen, Yunfei and Zhang, Yi} } @conference {tomkins:ijcai17, title = {Disambiguating Energy Disaggregation: A Collective Probabilistic Approach}, booktitle = {International Joint Conference on Artificial Intelligence}, year = {2017}, author = {Tomkins, Sabina and Pujara, Jay and Lise Getoor} } @article {bac:jmlr17, title = {Hinge-Loss Markov Random Fields and Probabilistic Soft Logic}, journal = {Journal of Machine Learning Research (JMLR)}, volume = {18}, year = {2017}, pages = {1--67}, abstract = {

A fundamental challenge in developing high-impact machine learning technologies is balancing the need to model rich, structured domains with the ability to scale to big data. Many important problem areas are both richly structured and large scale, from social and biological networks, to knowledge graphs and the Web, to images, video, and natural language. In this paper, we introduce two new formalisms for modeling structured data, and show that they can both capture rich structure and scale to big data. The first, hinge-loss Markov random fields (HL-MRFs), is a new kind of probabilistic graphical model that generalizes different approaches to convex inference. We unite three approaches from the randomized algorithms, probabilistic graphical models, and fuzzy logic communities, showing that all three lead to the same inference objective. We then define HL-MRFs by generalizing this unified objective. The second new formalism, probabilistic soft logic (PSL), is a probabilistic programming language that makes HL-MRFs easy to define using a syntax based on first-order logic. We introduce an algorithm for inferring most-probable variable assignments (MAP inference) that is much more scalable than general-purpose convex optimization methods, because it uses message passing to take advantage of sparse dependency structures. We then show how to learn the parameters of HL-MRFs. The learned HL-MRFs are as accurate as analogous discrete models, but much more scalable. Together, these algorithms enable HL-MRFs and PSL to model rich, structured data at scales not previously possible.

}, url = {https://github.com/stephenbach/bach-jmlr17-code}, author = {Bach, Stephen H. and Broecheler, Matthias and Huang, Bert and Lise Getoor} } @conference {ramesh:wi17, title = {Multi-relational influence models for online professional networks}, booktitle = {International Conference on Web Intelligence (ICWI)}, year = {2017}, pages = {291--298}, publisher = {ACM}, organization = {ACM}, abstract = {

Professional networks are a specialized class of social networks that are particularly aimed at forming and strengthening professional connections and have become a vital component of professional success and growth. In this paper, we present a holistic model to jointly represent different heterogenous relationships between pairs of individuals, user actions and their respective propagations to characterize influence in online professional networks. Previous work on influence in social networks typically only consider a single action type in characterizing influence. Our model is capable of representing and combining different kinds of information users assimilate in the network and compute pairwise values of influence taking the different types of actions into account. We evaluate our models on data from the largest professional network, LinkedIn and show the effectiveness of the inferred influence scores in predicting user actions. We further demonstrate that modeling different user actions, node features, and edge relationships between users leads to around 20\% increase in precision at top k in predicting user actions, when compared to the current state-of-the-art model.

}, author = {Ramesh, Arti and Rodriguez, Mario and Lise Getoor} } @conference {kim:www17, title = {Probabilistic Visitor Stitching on Cross-Device Web Logs}, booktitle = {International Conference on World Wide Web (WWW)}, year = {2017}, pages = {1581--1589}, author = {Kim, Sungchul and Kini, Nikhil and Pujara, Jay and Koh, Eunyee and Lise Getoor} } @article {farnadi:mlj17, title = {Soft quantification in statistical relational learning}, journal = {Machine Learning Journal}, year = {2017}, author = {Golnoosh Farnadi and Bach, Stephen H. and Moens, Marie-Francine and Lise Getoor and De Cock, Martine} } @conference {pujara:emnlp17, title = {Sparsity and Noise: Where Knowledge Graph Embeddings Fall Short}, booktitle = {Conference on Empirical Methods in Natural Language Processing (EMNLP)}, year = {2017}, url = {https://github.com/eriq-augustine/meta-kg}, author = {Pujara, Jay and Eriq Augustine and Lise Getoor} } @conference {kouki:recsys17, title = {User Preferences for Hybrid Explanations}, booktitle = {11th ACM Conference on Recommender Systems (RecSys)}, year = {2017}, author = {Kouki, Pigi and Schaffer, James and Pujara, Jay and O'Donovan, John and Lise Getoor} } @conference {sridhar:akbc17, title = {Using Noisy Extractions to Discover Causal Knowledge}, booktitle = {NIPS Workshop on Automated Knowledge Base Construction}, year = {2017}, author = {Dhanya Sridhar and Pujara, Jay and Lise Getoor} } @article {sridhar:bioinformatics16, title = {A Probabilistic Approach for Collective Similarity-based Drug-Drug Interaction Prediction}, journal = {Bioinformatics}, volume = {32}, year = {2016}, pages = {3175--3182}, abstract = {

MOTIVATION: As concurrent use of multiple medications becomes ubiquitous among patients, it is crucial to characterize both adverse and synergistic interactions between drugs. Statistical methods for prediction of putative drug-drug interactions (DDIs) can guide in vitro testing and cut down significant cost and effort. With the abundance of experimental data characterizing drugs and their associated targets, such methods must effectively fuse multiple sources of information and perform inference over the network of drugs.

RESULTS: We propose a probabilistic approach for jointly inferring unknown DDIs from a network of multiple drug-based similarities and known interactions. We use the highly scalable and easily extensible probabilistic programming framework Probabilistic Soft Logic. We compare against two methods including a state-of-the-art DDI prediction system across three experiments and show best performing improvements of more than 50\% in AUPR over both baselines. We find five novel interactions validated by external sources among the top-ranked predictions of our model.

AVAILABILITY AND IMPLEMENTATION: Final versions of all datasets and implementations will be made publicly available.

CONTACT: dsridhar@ucsc.edu.

}, author = {Dhanya Sridhar and Shobeir Fakhraei and Lise Getoor} } @conference {fakhraei:mlg16, title = {Adaptive Neighborhood Graph Construction for Inference in Multi-Relational Networks}, booktitle = {KDD}, year = {2016}, month = {2016}, publisher = {ACM SIGKDD}, organization = {ACM SIGKDD}, abstract = {

A neighborhood graph, which represents the instances as vertices and their relations as weighted edges, is the basis of many semi-supervised and relational models for node labeling and link prediction. Most methods employ a sequential process to construct the neighborhood graph. This process often consists of generating a candidate graph, pruning the candidate graph to make a neighborhood graph, and then performing inference on the variables (i.e., nodes) in the neighborhood graph. In this paper, we propose a framework that can dynamically adapt the neighborhood graph based on the states of variables from intermediate inference results, as well as structural properties of the relations connecting them. A key strength of our framework is its ability to handle multi-relational data and employ varying amounts of relations for each instance based on the intermediate inference results. We formulate the link prediction task as inference on neighborhood graphs, and include preliminary results illustrating the effects of different strategies in our proposed framework.

}, author = {Shobeir Fakhraei and Dhanya Sridhar and Jay Pujara and Lise Getoor} } @article {muthiah:aimag16, title = {Capturing Planned Protests from Open Source Indicators}, journal = {AI Magazine}, volume = {37}, number = {2}, year = {2016}, pages = {63--75}, abstract = {

Civil unrest events (protests, strikes, and {\textquotedblleft}occupy{\textquotedblright} events) are common occurrences in both democracies and authoritarian regimes. The study of civil unrest is a key topic for political scientists as it helps capture an important mechanism by which citizenry express themselves. In countries where civil unrest is lawful, qualitative analysis has revealed that more than 75 percent of the protests are planned, organized, or announced in advance; therefore detecting references to future planned events in relevant news and social media is a direct way to develop a protest forecasting system. We report on a system for doing that in this article. It uses a combination of keyphrase learning to identify what to look for, probabilistic soft logic to reason about location occurrences in extracted results, and time normalization to resolve future time mentions. We illustrate the application of our system to 10 countries in Latin America: Argentina, Brazil, Chile, Colombia, Ecuador, El Salvador, Mexico, Paraguay, Uruguay, and Venezuela. Results demonstrate our successes in capturing significant societal unrest in these countries with an average lead time of 4.08 days. We also study the selective superiorities of news media versus social media (Twitter, Facebook) to identify relevant trade-offs.

}, author = {Sathappan Muthiah and Bert Huang and Jaime Arredondo and David Mares and Lise Getoor and Graham Katz and Naren Ramakrishnan} } @conference {kouki:mlg16, title = {Entity Resolution in Familial Networks}, booktitle = {MLG}, year = {2016}, month = {2016}, abstract = {

Entity resolution is an important graph mining problem. Entity resolution is particularly interesting and challenging when there is rich relational structure. In this paper, we study the problem of performing entity resolution in familial networks. In our setting, we are given partial views of a familial network as described from the point of view of different people in the network and our goal is to reconstruct the underlying familial network from these perspective partial views. The data and relations provided may be inaccurate, missing or incomplete. In our approach, we start by augmenting the known set of familial relations with additional ones that are either inversed or derived from the original set of relations by linkage heuristics. Additionally, we propose a set of measures that capture the similarity of persons in the familial network based on both personal and relational information. We present a supervised learning approach where we view entity resolution in familial networks as a classification problem. Our experiments on real-world data from multiple-informant pedigrees show that our approach works well and that we can improve performance by considering separate similarity scores for each relation type.

},
  author = {Kouki, Pigi and Marcum, Christopher and Koehly, Laura and Getoor, Lise}
}

@conference {pujara:starai16,
  title = {Generic Statistical Relational Entity Resolution in Knowledge Graphs},
  booktitle = {StarAI},
  year = {2016},
  note = {On arXiv: https://arxiv.org/abs/1607.00992},
  publisher = {IJCAI 2016},
  organization = {IJCAI 2016},
  abstract = {

Entity resolution, the problem of identifying the underlying entity of references found in data, has been researched for many decades in many communities. A common theme in this research has been the importance of incorporating relational features into the resolution process. Relational entity resolution is particularly important in knowledge graphs (KGs), which have a regular structure capturing entities and their interrelationships. We identify three major problems in KG entity resolution: (1) intra-KG reference ambiguity; (2) inter-KG reference ambiguity; and (3) ambiguity when extending KGs with new facts. We implement a framework that generalizes across these three settings and exploits this regular structure of KGs. Our framework has many advantages over custom solutions widely deployed in industry, including collective inference, scalability, and interpretability. We apply our framework to two real-world KG entity resolution problems, ambiguity in NELL and merging data from Freebase and MusicBrainz, demonstrating the importance of relational features.

},
  author = {Pujara, Jay and Getoor, Lise}
}

@conference {sridhar:kddws16,
  title = {Joint Probabilistic Inference of Causal Structure},
  booktitle = {KDD Workshop on CD},
  year = {2016},
  abstract = {

Causal directed acyclic graphical models (DAGs) are powerful reasoning tools in the study and estimation of cause and effect in scientific and socio-behavioral phenomena. In many domains where the cause and effect structure is unknown, a key challenge in studying causality with DAGs is learning the structure of causal graphs directly from observational data. Traditional approaches to causal structure discovery are categorized as constraint-based or score-based approaches. Score-based methods perform greedy search over the space of models whereas constraint-based methods iteratively prune and orient edges using structural and statistical constraints. However, both types of approaches rely on heuristics that introduce false positives and negatives. In our work, we cast causal structure discovery as an inference problem and propose a joint probabilistic approach for optimizing over model structures. We use a recently introduced and highly efficient probabilistic programming framework known as Probabilistic Soft Logic (PSL) to encode constraint-based structure search. With this novel probabilistic approach to structure discovery, we leverage multiple independence tests and avoid early pruning and variable ordering. We compare our method to the notable PC algorithm on a well-studied synthetic dataset and show improvements in accuracy of predicting causal edges.

},
  author = {Sridhar, Dhanya and Getoor, Lise}
}

@conference {tomkins:edm16,
  title = {Predicting Post-Test Performance from Online Student Behavior: A High School MOOC Case Study},
  booktitle = {EDM},
  year = {2016},
  abstract = {

With the success and proliferation of Massive Open Online Courses (MOOCs) for college curricula, there is demand for adapting this modern mode of education for high school courses. Online and open courses have the potential to fill a much needed gap in high school curricula, especially in fields such as computer science, where there is shortage of trained teachers nationwide. In this paper, we analyze student post-test performance to determine the success of a high school computer science MOOC. We empirically characterize student success by using students{\textquoteright} performance on the Advanced Placement (AP) exam, which we treat as a post test. This post-test performance is more indicative of long-term learning than course performance, and allows us to model the extent to which students have internalized course material. Additionally, we analyze and compare the performance of a subset of students who received in-person coaching at their high school, to those students who took the course independently. This comparison provides better understanding of the role of a teacher in a student{\textquoteright}s learning. We build a predictive machine learning model, and use it to identify the key factors contributing to the success of online high school courses. Our analysis demonstrates that high schoolers can thrive in MOOCs.

},
  keywords = {high school MOOCs, online education, student learning},
  author = {Tomkins, Sabina and Ramesh, Arti and Getoor, Lise}
}

@conference {sridhar:uaiws16,
  title = {Probabilistic Inference for Causal Structure Discovery},
  booktitle = {UAI Workshop on Causation},
  year = {2016},
  author = {Sridhar, Dhanya and Getoor, Lise}
}

@conference {rekatsinas:sigmod16,
  title = {SourceSight: Enabling Effective Source Selection},
  booktitle = {SIGMOD},
  year = {2016},
  abstract = {

Recently there has been a rapid increase in the number of data sources and data services, such as cloud-based data markets and data portals, that facilitate the collection, publishing and trading of data. Data sources typically exhibit large heterogeneity in the type and quality of data they provide. Unfortunately, when the number of data sources is large, it is difficult for users to reason about the actual usefulness of sources for their applications and the trade-offs between the benefits and costs of acquiring and integrating sources. In this demonstration we present SOURCESIGHT, a system that allows users to interactively explore a large number of heterogeneous data sources, and discover valuable sets of sources for diverse integration tasks. SOURCESIGHT uses a novel multi-level source quality index that enables effective source selection at different granularity levels, and introduces a collection of new techniques to discover and evaluate relevant sources for integration.

},
  author = {Rekatsinas, Theodoros and Deshpande, Amol and Dong, Luna and Getoor, Lise and Srivastava, Divesh}
}

@article {london:jmlr16,
  title = {Stability and Generalization in Structured Prediction},
  journal = {Journal of Machine Learning Research},
  volume = {17},
  year = {2016},
  note = {to appear},
  pages = {1--52},
  abstract = {

Structured prediction models have been found to learn effectively from a few large examples{\textemdash}sometimes even just one. Despite empirical evidence, canonical learning theory cannot guarantee generalization in this setting because the error bounds decrease as a function of the number of examples. We therefore propose new PAC-Bayesian generalization bounds for structured prediction that decrease as a function of both the number of examples and the size of each example. Our analysis hinges on the stability of joint inference and the smoothness of the data distribution. We apply our bounds to several common learning scenarios, including max-margin and soft-max training of Markov random fields. Under certain conditions, the resulting error bounds can be far more optimistic than previous results and can even guarantee generalization from a single large example.

},
  keywords = {PAC-Bayes, generalization bounds, learning theory, structured prediction},
  author = {London, Ben and Huang, Bert and Getoor, Lise}
}

@conference {kumar:asonam16,
  title = {Unsupervised Models for Predicting Strategic Relations between Organizations},
  booktitle = {ASONAM},
  year = {2016},
  abstract = {

Microblogging sites like Twitter provide a platform for sharing ideas and expressing opinions. The widespread popularity of these platforms and the complex social structure that arises within these communities provides a unique opportunity to understand the interactions between users. The political domain, especially in a multi-party system, presents compelling challenges, as political parties have different levels of alignment based on their political strategies. We use Twitter to understand the nuanced relationships between differing political entities in Latin America. Our model incorporates diverse signals from the content of tweets and social context from retweets, mentions and hashtag usage. Since direct communications between entities are relatively rare, we explore models based on the posts of users who interact with multiple political organizations. We present a quantitative and qualitative analysis of the results of models using different features, and demonstrate that a model capable of using sentiment strength, social context, and issue alignment has superior performance to less sophisticated baselines.

},
  author = {Kumar, Shachi and Pujara, Jay and Getoor, Lise and Mares, David and Gupta, Dipak and Riloff, Ellen}
}

@conference {pujara:uai15,
  title = {Budgeted Online Collective Inference},
  booktitle = {UAI},
  year = {2015},
  abstract = {

Updating inference in response to new evidence is a fundamental challenge in artificial intelligence. Many real problems require large probabilistic graphical models, containing millions of interdependent variables. For such large models, jointly updating the most likely (i.e., MAP) configuration of the variables each time new evidence is encountered can be infeasible, even if inference is tractable. In this paper, we introduce budgeted online collective inference, in which the MAP configuration of a graphical model is updated efficiently by revising the assignments to a subset of the variables while holding others fixed. The goal is to selectively update certain variables without sacrificing quality with respect to full inference. To formalize the consequences of partially updating inference, we introduce the concept of inference regret. We derive inference regret bounds for a class of graphical models with strongly-convex free energies. These theoretical insights, combined with a thorough analysis of the optimization solver, motivate new approximate methods for efficiently updating the variable assignments under a budget constraint. In experiments, we demonstrate that our algorithms can reduce inference time by 65\% with accuracy comparable to full inference.

},
  author = {Pujara, Jay and London, Ben and Getoor, Lise}
}

@article {namata:tkdd15,
  title = {Collective Graph Identification},
  journal = {TKDD},
  volume = {10},
  number = {3},
  year = {2015},
  pages = {1--36},
  abstract = {

Data describing networks{\textemdash}such as communication networks, transaction networks, disease transmission networks, collaboration networks, etc.{\textemdash}are becoming increasingly available. While observational data can be useful, it often only hints at the actual underlying process that governs interactions and attributes. For example, an email communication network provides insight into its users and their relationships, but is not the same as the {\textquotedblleft}real{\textquotedblright} underlying social network. In this article, we introduce the problem of graph identification, i.e., discovering the latent graph structure underlying an observed network. We cast the problem as a probabilistic inference task, in which we must infer the nodes, edges, and node labels of a hidden graph, based on evidence. This entails solving several canonical problems in network analysis: entity resolution (determining when two observations correspond to the same entity), link prediction (inferring the existence of links), and node labeling (inferring hidden attributes). While each of these subproblems has been well studied in isolation, here we consider them as a single, collective task. We present a simple, yet novel, approach to address all three subproblems simultaneously. Our approach, which we refer to as C3, consists of a collection of Coupled Collective Classifiers that are applied iteratively to propagate inferred information among the subproblems. We consider variants of C3 using different learning and inference techniques and empirically demonstrate that C3 is superior, both in terms of predictive accuracy and running time, to state-of-the-art probabilistic approaches on four real problems.

},
  author = {Namata, Galileo and London, Ben and Getoor, Lise}
}

@conference {fakhraei:kdd15,
  title = {Collective Spammer Detection in Evolving Multi-Relational Social Networks},
  booktitle = {KDD},
  year = {2015},
  note = {Data and Code: https://github.com/shobeir/fakhraei_kdd2015},
  abstract = {

Detecting unsolicited content and the spammers who create it is a long-standing challenge that affects all of us on a daily basis. The recent growth of richly-structured social networks has provided new challenges and opportunities in the spam detection landscape. Motivated by the Tagged.com social network, we develop methods to identify spammers in evolving multi-relational social networks. We model a social network as a time-stamped multi-relational graph where vertices represent users, and edges represent different activities between them. To identify spammer accounts, our approach makes use of structural features, sequence modelling, and collective reasoning. We leverage relational sequence information using k-gram features and probabilistic modelling with a mixture of Markov models. Furthermore, in order to perform collective reasoning and improve the predictive power of a noisy abuse reporting system, we develop a statistical relational model using hinge-loss Markov random fields (HL-MRFs), a class of probabilistic graphical models which are highly scalable. We use Graphlab Create and Probabilistic Soft Logic (PSL) to prototype and experimentally evaluate our solutions on internet-scale data from Tagged.com. Our experiments demonstrate the effectiveness of our approach, and show that models which incorporate the multi-relational nature of the social network significantly gain predictive performance over those that do not.

},
  author = {Fakhraei, Shobeir and Foulds, James and Shashanka, Madhusudana and Getoor, Lise}
}

@book {fakhraei:book15,
  title = {Data Analytics for Pharmaceutical Discoveries},
  series = {Healthcare Data Analytics},
  volume = {1},
  year = {2015},
  pages = {1--25},
  publisher = {CRC Press},
  organization = {CRC Press},
  edition = {1},
  chapter = {1},
  author = {Fakhraei, Shobeir and Onukwugha, Eberechukwu and Getoor, Lise}
}

@conference {rekatsinas:cidr15,
  title = {Finding Quality in Quantity: The Challenge of Discovering Valuable Sources for Integration},
  booktitle = {7th Biennial Conference on Innovative Data Systems Research (CIDR '15)},
  year = {2015},
  author = {Rekatsinas, Theodoros and Dong, Xin Luna and Getoor, Lise and Srivastava, Divesh}
}

@article {rekatsinas:sam2016,
  title = {Forecasting Rare Disease Outbreaks Using Multiple Data Sources},
  journal = {Statistical Analysis and Data Mining},
  year = {2015},
  note = {Best of SDM 2015, Special Issue},
  pages = {379},
  abstract = {

Rapidly increasing volumes of news feeds from diverse data sources, such as online newspapers, Twitter and online blogs are proving to be extremely valuable resources in helping anticipate, detect, and forecast outbreaks of rare diseases. This paper presents SourceSeer, a novel algorithmic framework that combines spatio-temporal topic models with source-based anomaly detection techniques to effectively forecast the emergence and progression of infectious rare diseases. SourceSeer is capable of discovering the location focus of each source allowing sources to be used as experts with varying degrees of authoritativeness. To fuse the individual source predictions into a final outbreak prediction we employ a multiplicative weights algorithm taking into account the accuracy of each source. We evaluate the performance of SourceSeer using incidence data for hantavirus syndromes in multiple countries of Latin America provided by HealthMap over a timespan of fifteen months. We demonstrate that SourceSeer makes predictions of increased accuracy compared to several baselines and is capable of forecasting disease outbreaks in a timely manner even when no outbreaks were previously reported.

},
  author = {Rekatsinas, Theodoros and Ghosh, Saurav and Mekaru, Sumiko and Nsoesie, Elaine and Brownstein, John and Getoor, Lise and Ramakrishnan, Naren}
}

@conference {he:icml15,
  title = {HawkesTopic: A Joint Model for Network Inference and Topic Modeling from Text-Based Cascades},
  booktitle = {International Conference on Machine Learning},
  year = {2015},
  author = {He, Xinran and Rekatsinas, Theodoros and Foulds, James and Getoor, Lise and Liu, Yan}
}

@article {bach:arxiv15,
  title = {Hinge-Loss Markov Random Fields and Probabilistic Soft Logic},
  journal = {ArXiv:1505.04406 [cs.LG]},
  eprint = {1505.04406},
  archiveprefix = {arXiv},
  primaryclass = {cs.LG},
  year = {2015},
  note = {To reference this work, please cite the JMLR paper.},
  author = {Bach, Stephen H. and Broecheler, Matthias and Huang, Bert and Getoor, Lise}
}

@conference {kouki:recsys15,
  title = {HyPER: A Flexible and Extensible Probabilistic Framework for Hybrid Recommender Systems},
  booktitle = {9th ACM Conference on Recommender Systems (RecSys)},
  year = {2015},
  publisher = {ACM},
  organization = {ACM},
  author = {Kouki, Pigi and Fakhraei, Shobeir and Foulds, James and Eirinaki, Magdalini and Getoor, Lise}
}

@conference {sridhar:acl15,
  title = {Joint Models of Disagreement and Stance in Online Debate},
  booktitle = {Annual Meeting of the Association for Computational Linguistics (ACL)},
  year = {2015},
  author = {Sridhar, Dhanya and Foulds, James and Walker, Marilyn and Huang, Bert and Getoor, Lise}
}

@conference {foulds:icml15,
  title = {Latent Topic Networks: A Versatile Probabilistic Programming Framework for Topic Models},
  booktitle = {International Conference on Machine Learning (ICML)},
  year = {2015},
  abstract = {

Topic models have become increasingly prominent text-analytic machine learning tools for research in the social sciences and the humanities. In particular, custom topic models can be developed to answer specific research questions. The design of these models requires a nontrivial amount of effort and expertise, motivating general-purpose topic modeling frameworks. In this paper we introduce latent topic networks, a flexible class of richly structured topic models designed to facilitate applied research. Custom models can straightforwardly be developed in our framework with an intuitive first-order logical probabilistic programming language. Latent topic networks admit scalable training via a parallelizable EM algorithm which leverages ADMM in the M-step. We demonstrate the broad applicability of the models with case studies on modeling influence in citation networks, and U.S. Presidential State of the Union addresses.

},
  author = {Foulds, James and Kumar, Shachi and Getoor, Lise}
}

@article {kimmig:mlj15,
  title = {Lifted graphical models: a survey},
  journal = {Machine Learning Journal},
  volume = {99},
  number = {1},
  year = {2015},
  pages = {1--45},
  author = {Kimmig, Angelika and Mihalkova, Lilyana and Getoor, Lise}
}

@conference {pujara:starai15,
  title = {Online Inference for Knowledge Graph Construction},
  booktitle = {Workshop on Statistical Relational AI},
  year = {2015},
  author = {Pujara, Jay and London, Ben and Getoor, Lise and Cohen, William}
}

@conference {bach:icml15,
  title = {Paired-Dual Learning for Fast Training of Latent Variable Hinge-Loss MRFs},
  booktitle = {International Conference on Machine Learning (ICML)},
  year = {2015},
  note = {Stephen Bach and Bert Huang contributed equally.},
  abstract = {

Latent variables allow probabilistic graphical models to capture nuance and structure in important domains such as network science, natural language processing, and computer vision. Naive approaches to learning such complex models can be prohibitively expensive{\textemdash}because they require repeated inferences to update beliefs about latent variables{\textemdash}so lifting this restriction for useful classes of models is an important problem. Hinge-loss Markov random fields (HL-MRFs) are graphical models that allow highly scalable inference and learning in structured domains, in part by representing structured problems with continuous variables. However, this representation leads to challenges when learning with latent variables. We introduce paired-dual learning, a framework that greatly speeds up training by using tractable entropy surrogates and avoiding repeated inferences. Paired-dual learning optimizes an objective with a pair of dual inference problems. This allows fast, joint optimization of parameters and dual variables. We evaluate on social-group detection, trust prediction in social networks, and image reconstruction, finding that paired-dual learning trains models as accurate as those trained by traditional methods in much less time, often before traditional methods make even a single parameter update.

},
  author = {Bach, Stephen H. and Huang, Bert and Boyd-Graber, Jordan and Getoor, Lise}
}

@conference {grycner:emnlp15,
  title = {RELLY: Inferring Hypernym Relationships Between Relational Phrases},
  booktitle = {Conference on Empirical Methods in Natural Language Processing},
  year = {2015},
  author = {Grycner, Adam and Weikum, Gerhard and Pujara, Jay and Foulds, James and Getoor, Lise}
}

@conference {rekatsinas:sdm15,
  title = {SourceSeer: Forecasting Rare Disease Outbreaks Using Multiple Data Sources},
  booktitle = {2015 SIAM International Conference on Data Mining (SDM15)},
  year = {2015},
  note = {Best Research Paper Award},
  publisher = {SIAM},
  organization = {SIAM},
  author = {Rekatsinas, Theodoros and Ghosh, Saurav and Mekaru, Sumiko and Nsoesie, Elaine and Brownstein, John and Getoor, Lise and Ramakrishnan, Naren}
}

@unpublished {london:stability15,
  title = {Stability and Generalization in Structured Prediction},
  year = {2015},
  note = {preprint},
  keywords = {PAC-Bayes, generalization bounds, learning theory, structured prediction},
  author = {London, Ben and Huang, Bert and Getoor, Lise}
}

@conference {farnadi:ilp15,
  title = {Statistical Relational Learning with Soft Quantifiers},
  booktitle = {International Conference on Inductive Logic Programming (ILP)},
  year = {2015},
  note = {Winner of Best Student Paper award.},
  author = {Farnadi, Golnoosh and Bach, Stephen H. and Blondeel, Marjon and Moens, Marie-Francine and Getoor, Lise and De Cock, Martine}
}

@conference {london:icml15,
  title = {The Benefits of Learning with Strongly Convex Approximate Inference},
  booktitle = {ICML},
  year = {2015},
  abstract = {

We explore the benefits of strongly convex free energies in variational inference, providing both theoretical motivation and a new meta-algorithm. Using the duality between strong convexity and stability, we prove a high-probability bound on the error of learned marginals that is inversely proportional to the modulus of convexity of the free energy, thereby motivating free energies whose moduli are constant with respect to the size of the graph. We identify sufficient conditions for $\Omega(1)$-strong convexity in two popular variational techniques: tree-reweighted and counting number entropies. Our insights for the latter suggest a novel counting number optimization framework, which guarantees strong convexity for any given modulus. Our experiments demonstrate that learning with a strongly convex free energy, using our optimization framework to guarantee a given modulus, results in substantially more accurate marginal probabilities, thereby validating our theoretical claims and the effectiveness of our framework.

},
  author = {London, Ben and Huang, Bert and Getoor, Lise}
}

@conference {ramesh:nipsws15,
  title = {Understanding Influence in Online Professional Networks},
  booktitle = {NIPS Workshop on Networks in Social and Information Sciences},
  year = {2015},
  keywords = {HL-MRFs, influence, professional networks, social networks},
  author = {Ramesh, Arti and Rodriguez, Mario and Getoor, Lise}
}

@conference {bach:aistats15,
  title = {Unifying Local Consistency and MAX SAT Relaxations for Scalable Inference with Rounding Guarantees},
  booktitle = {Artificial Intelligence and Statistics (AISTATS)},
  year = {2015},
  author = {Bach, Stephen H. and Huang, Bert and Getoor, Lise}
}

@article {pujara:aimag15,
  title = {Using Semantics \& Statistics to Turn Data into Knowledge},
  journal = {AI Magazine},
  volume = {36},
  number = {1},
  year = {2015},
  pages = {65--74},
  author = {Pujara, Jay and Miao, Hui and Getoor, Lise and Cohen, William}
}

@conference {ramesh:acl15,
  title = {Weakly Supervised Models of Aspect-Sentiment for Online Course Discussion Forums},
  booktitle = {53rd Annual Meeting of the Association for Computational Linguistics (ACL)},
  year = {2015},
  keywords = {Discussion Forums, HL-MRFs, MOOCs, Online Courses, SRL},
  author = {Ramesh, Arti and Kumar, Shachi and Foulds, James and Getoor, Lise}
}

@conference {grycner:akbc2014,
  title = {A Unified Probabilistic Approach for Semantic Clustering of Relational Phrases},
  booktitle = {NeurIPS},
  year = {2014},
  abstract = {

The task of finding synonymous relational phrases is important in natural language understanding problems such as question answering and paraphrase detection. While this task has been addressed by many previous systems, each of these existing approaches is limited either in expressivity or in scalability. To address this challenge, we present a large-scale statistical relational method for clustering relational phrases using Probabilistic Soft Logic (PSL) [1]. To assess the quality of our approach, we evaluated it relative to a set of baseline methods. The proposed technique was found to outperform the baselines for both clustering and link prediction, and was shown to be scalable enough to be applied to 200,000 relational phrases.

},
  author = {Grycner, Adam and Weikum, Gerhard and Pujara, Jay and Foulds, James and Getoor, Lise}
}

@conference {ramakrishnan:kdd14,
  title = {{\textquoteleft}Beating the news{\textquoteright} with EMBERS: Forecasting Civil Unrest using Open Source Indicators},
  booktitle = {ACM SIGKDD Conference on Knowledge Discovery and Data Mining},
  year = {2014},
  abstract = {

We describe the design, implementation, and evaluation of EMBERS, an automated, 24x7 continuous system for forecasting civil unrest across 10 countries of Latin America using open source indicators such as tweets, news sources, blogs, economic indicators, and other data sources. Unlike retrospective studies, EMBERS has been making forecasts into the future since Nov 2012 which have been (and continue to be) evaluated by an independent T\&E team (MITRE). Of note, EMBERS has successfully forecast the uptick and downtick of incidents during the June 2013 protests in Brazil. We outline the system architecture of EMBERS, individual models that leverage specific data sources, and a fusion and suppression engine that supports trading off specific evaluation criteria. EMBERS also provides an audit trail interface that enables the investigation of why specific predictions were made along with the data utilized for forecasting. Through numerous evaluations, we demonstrate the superiority of EMBERS over baserate methods and its capability to forecast significant societal happenings.

},
  author = {Ramakrishnan, Naren and Butler, Patrick and Self, Nathan and Khandpur, Rupinder and Saraf, Parang and Wang, Wei and Cadena, Jose and Vullikanti, Anil and Korkmaz, Gizem and Kuhlman, Christopher and Marathe, Achla and Zhao, Liang and Ting, Hua and Huang, Bert and Srinivasan, Aravind and Trinh, Khoa and Getoor, Lise and Katz, Graham and Doyle, Andy and Ackermann, Chris and Zavorin, Ilya and Ford, Jim and Summers, Kristin and Fayed, Youssef and Arredondo, Jaime and Gupta, Dipak and Mares, David}
}

@conference {pujara:akbc14,
  title = {Building Dynamic Knowledge Graphs},
  booktitle = {NIPS Workshop on Automated Knowledge Base Construction},
  year = {2014},
  author = {Pujara, Jay and Getoor, Lise}
}

@conference {3,
  title = {Collective Stance Classification of Posts in Online Debate Forums},
  booktitle = {ACL Joint Workshop on Social Dynamics and Personal Attributes in Social Media},
  year = {2014},
  author = {Sridhar, Dhanya and Getoor, Lise and Walker, Marilyn}
}

@conference {sridhar:baylearn14,
  title = {Collective classification of stance and disagreement in online debate forums},
  booktitle = {Bay Area Machine Learning Symposium (BayLearn)},
  year = {2014},
  author = {Sridhar, Dhanya and Foulds, James and Huang, Bert and Walker, Marilyn and Getoor, Lise}
}

@conference {farnadi:starai14,
  title = {Extending PSL with Fuzzy Quantifiers},
  booktitle = {International Workshop on Statistical Relational Artificial Intelligence (StaRAI)},
  year = {2014},
  author = {Farnadi, Golnoosh and Bach, Stephen H. and Moens, Marie-Francine and Getoor, Lise and De Cock, Martine}
}

@conference {ramesh:aaai14,
  title = {Learning Latent Engagement Patterns of Students in Online Courses},
  booktitle = {Proceedings of the Twenty-Eighth AAAI Conference on Artificial Intelligence},
  year = {2014},
  publisher = {AAAI Press},
  organization = {AAAI Press},
  keywords = {MOOC, learner engagement, probabilistic modeling, structured prediction},
  author = {Ramesh, Arti and Goldwasser, Dan and Huang, Bert and Daume III, Hal and Getoor, Lise}
}

@article {kimmig:machinelearning2004,
  title = {Lifted graphical models: a survey},
  journal = {Machine Learning},
  year = {2014},
  pages = {1--45},
  keywords = {First-order probabilistic models, Lifted inference and learning, Par-factor graphs, Probabilistic programming, Statistical relational learning, Templated graphical models},
  author = {Kimmig, Angelika and Mihalkova, Lilyana and Getoor, Lise}
}

@article {fakhraei:tcbb14,
  title = {Network-Based Drug-Target Interaction Prediction with Probabilistic Soft Logic},
  journal = {IEEE/ACM Transactions on Computational Biology and Bioinformatics},
  year = {2014},
  note = {Code and data: https://github.com/shobeir/fakhraei_tcbb2014},
  author = {Fakhraei, Shobeir and Huang, Bert and Raschid, Louiqa and Getoor, Lise}
}

@conference {london:aistats14,
  title = {PAC-Bayesian Collective Stability},
  booktitle = {Proceedings of the 17th International Conference on Artificial Intelligence and Statistics},
  year = {2014},
  author = {London, Ben and Huang, Bert and Taskar, Benjamin and Getoor, Lise}
}

@conference {bach:dssg14,
  title = {Probabilistic Soft Logic for Social Good},
  booktitle = {KDD Workshop on Data Science for Social Good},
  year = {2014},
  author = {Bach, Stephen H. and Huang, Bert and Getoor, Lise}
}

@conference {bach:discml14,
  title = {Rounding Guarantees for Message-Passing MAP Inference with Logical Dependencies},
  booktitle = {NIPS Workshop on Discrete and Combinatorial Problems in Machine Learning (DISCML)},
  year = {2014},
  author = {Bach, Stephen H. and Huang, Bert and Getoor, Lise}
}

@conference {london:nips14ws,
  title = {On the Strong Convexity of Variational Inference},
  booktitle = {NIPS Workshop on Advances in Variational Inference},
  year = {2014},
  author = {London, Ben and Huang, Bert and Getoor, Lise}
}

@conference {moustafa:icde14,
  title = {Subgraph Pattern Matching over Uncertain Graphs with Identity Linkage Uncertainty},
  booktitle = {International Conference on Data Engineering (ICDE)},
  year = {2014},
  author = {Moustafa, Walaa Eldin and Kimmig, Angelika and Deshpande, Amol and Getoor, Lise}
}

@article {skaggs:tois2014,
  title = {Topic Modeling for Wikipedia Link Disambiguation},
  journal = {ACM Transactions on Information Systems},
  volume = {32},
  number = {3},
  year = {2014},
  author = {Skaggs, Bradley and Getoor, Lise}
}

@conference {ramesh:las13,
  title = {Uncovering Hidden Engagement Patterns for Predicting Learner Performance in MOOCs},
  booktitle = {ACM Conference on Learning at Scale},
  series = {Annual Conference Series},
  year = {2014},
  publisher = {ACM},
  organization = {ACM},
  keywords = {MOOC, learner engagement, learning analytics, online education, probabilistic modeling, structured prediction},
  author = {Ramesh, Arti and Goldwasser, Dan and Huang, Bert and Daume III, Hal and Getoor, Lise}
}

@conference {ramesh:aclws14,
  title = {Understanding MOOC Discussion Forums using Seeded LDA},
  booktitle = {ACL Workshop on Innovative Use of NLP for Building Educational Applications},
  year = {2014},
  publisher = {ACL},
  organization = {ACL},
  abstract = {

Discussion forums serve as a platform for student discussions in massive open online courses (MOOCs). Analyzing content in these forums can uncover useful information for improving student retention and help in initiating instructor intervention. In this work, we explore the use of topic models, particularly seeded topic models toward this goal. We demonstrate that features derived from topic analysis help in predicting student survival.

},
  keywords = {LDA, MOOC Discussion Forums, Seeded LDA, structured prediction},
  author = {Arti Ramesh and Dan Goldwasser and Bert Huang and Hal Daume III and Lise Getoor}
}

@conference{huang:sbp2013,
  title = {A Flexible Framework for Probabilistic Models of Social Trust},
  booktitle = {SBP},
  year = {2013},
  abstract = {

In social networks, notions such as trust, fondness, or respect between users can be expressed by associating a strength with each tie. This provides a view of social interaction as a weighted graph. Sociological models for such weighted networks can differ significantly in their basic motivations and intuitions. In this paper, we present a flexible framework for probabilistic modeling of social networks that allows one to represent these different models and more. The framework, probabilistic soft logic (PSL), is particularly well-suited for this domain, as it combines a declarative, first-order logic-based syntax for describing relational models with a soft-logic representation, which maps naturally to the non-discrete strength of social trust. We demonstrate the flexibility and effectiveness of PSL for trust prediction using two different approaches: a structural balance model based on social triangles, and a social status model based on a consistent status hierarchy. We test these models on real social network data and find that PSL is an effective tool for trust prediction.

},
  author = {Bert Huang and Angelika Kimmig and Lise Getoor and Jennifer Golbeck}
}

@conference{london:sptli13,
  title = {Collective Activity Detection using Hinge-loss Markov Random Fields},
  booktitle = {CVPR Workshop on SPTLE},
  year = {2013},
  abstract = {

We propose hinge-loss Markov random fields (HL-MRFs), a powerful class of continuous-valued graphical models, for high-level computer vision tasks. HL-MRFs are characterized by log-concave density functions, and are able to perform efficient, exact inference. Their templated hinge-loss potential functions naturally encode soft-valued logical rules. Using the declarative modeling language probabilistic soft logic, one can easily define HL-MRFs via familiar constructs from first-order logic. We apply HL-MRFs to the task of activity detection, using principles of collective classification. Our model is simple, intuitive and interpretable. We evaluate our model on two datasets and show that it achieves significant lift over the low-level detectors.

},
  author = {Ben London and Sameh Khamis and Stephen Bach and Bert Huang and Lise Getoor and Larry Davis}
}

% A chapter (15, pp. 399--416) with its own authors inside a volume edited
% by someone else, so the entry type is incollection and the containing
% book's title is carried in booktitle rather than series.
@incollection{london:book13,
  title = {Collective Classification of Network Data},
  booktitle = {Data Classification: Algorithms and Applications},
  volume = {1},
  year = {2013},
  note = {May differ from the published version},
  pages = {399--416},
  publisher = {CRC Press},
  organization = {CRC Press},
  edition = {1},
  chapter = {15},
  author = {Ben London and Lise Getoor},
  editor = {Charu Aggarwal}
}

@conference{fakhraei:mlcb2013,
  title = {Collective Inference and Multi-Relational Learning for Drug{\textendash}Target Interaction Prediction},
  booktitle = {NIPS Workshop on MLCB},
  year = {2013},
  abstract = {

State-of-the-art methods for drug-target interaction prediction make use of interaction networks, drug similarities, and target similarities. In this paper we study the importance of multi-relational and collective prediction in these domains. We implement different models with probabilistic soft logic (PSL) to empirically show the effect of each assumption on prediction performance and demonstrate that a model using collective inference and combination of similarities significantly outperforms other models. In other words, we show the superiority of the models that combine multiple heterogeneous evidence and take advantage of the relational structure of the data.

},
  author = {Shobeir Fakhraei and Bert Huang and Lise Getoor}
}

@conference{london:icml13,
  title = {Collective Stability in Structured Prediction: Generalization from One Example},
  booktitle = {ICML},
  year = {2013},
  abstract = {

Structured predictors enable joint inference over multiple interdependent output variables. These models are often trained on a small number of examples with large internal structure. Existing distribution-free generalization bounds do not guarantee generalization in this setting, though this contradicts a large body of empirical evidence from computer vision, natural language processing, social networks and other fields. In this paper, we identify a set of natural conditions {\textendash} weak dependence, hypothesis complexity and a new measure, collective stability {\textendash} that are sufficient for generalization from even a single example, without imposing an explicit generative model of the data. We then demonstrate that the complexity and stability conditions are satisfied by a broad class of models, including marginal inference in templated graphical models. We thus obtain uniform convergence rates that can decrease significantly faster than previous bounds, particularly when each structured example is sufficiently large and the number of training examples is constant, even one.

},
  author = {Ben London and Bert Huang and Benjamin Taskar and Lise Getoor}
}

@conference{fakhraei:biokdd13,
  title = {Drug-Target Interaction Prediction for Drug Repurposing with Probabilistic Similarity Logic},
  booktitle = {KDD Workshop on BIOKDD},
  year = {2013},
  publisher = {ACM},
  organization = {ACM},
  abstract = {

The high development cost and low success rate of drug discovery from new compounds highlight the need for methods to discover alternate therapeutic effects for currently approved drugs. Computational methods can be effective in focusing efforts for such drug repurposing. In this paper, we propose a novel drug-target interaction prediction framework based on probabilistic similarity logic (PSL) [5]. Interaction prediction corresponds to link prediction in a bipartite network of drug-target interactions extended with a set of similarities between drugs and between targets. Using probabilistic first-order logic rules in PSL, we show how rules describing link predictions based on triads and tetrads can effectively make use of a variety of similarity measures. We learn weights for the rules based on training data, and report relative importance of each similarity for interaction prediction. We show that the learned rule weights significantly improve prediction precision. We evaluate our results on a dataset of drug-target interactions obtained from Drugbank [27] augmented with five drug-based and three target-based similarities. We integrate domain knowledge in drug-target interaction prediction and match the performance of the state-of-the-art drug-target interaction prediction systems [22] with our model using simple triad-based rules. Furthermore, we apply techniques that make link prediction in PSL more efficient for drug-target interaction prediction.

},
  author = {Shobeir Fakhraei and Louiqa Raschid and Lise Getoor}
}

@conference{huang:slg13,
  title = {Empirical Analysis of Collective Stability},
  booktitle = {ICML Workshop on SLG},
  year = {2013},
  abstract = {

When learning structured predictors, collective stability is an important factor for generalization. London et al. (2013) provide the first analysis of this effect, proving that collectively stable hypotheses produce less deviation between empirical risk and true risk, i.e., defect. We test this effect empirically using a collectively stable variant of max-margin Markov networks. Our experiments on webpage classification validate that increasing the collective stability reduces the defect and can thus lead to lower overall test error.

},
  author = {Bert Huang and Ben London and Benjamin Taskar and Lise Getoor}
}

@conference{getoor:kdd13,
  title = {Entity Resolution in Big Data},
  booktitle = {KDD},
  year = {2013},
  note = {Slides: http://www.umiacs.umd.edu/~getoor/Tutorials/ER_KDD2013.pdf},
  abstract = {

Entity resolution (ER), the problem of extracting, matching and resolving entity mentions in structured and unstructured data, is a long-standing challenge in database management, information retrieval, machine learning, natural language processing and statistics. Accurate and fast entity resolution has huge practical implications in a wide variety of commercial, scientific and security domains. Despite the long history of work on entity resolution, there is still a surprising diversity of approaches, and lack of guiding theory. Meanwhile, in the age of big data, the need for high quality entity resolution is growing, as we are inundated with more and more data, all of which needs to be integrated, aligned and matched, before further utility can be extracted. In this tutorial, we bring together perspectives on entity resolution from a variety of fields, including databases, information retrieval, natural language processing and machine learning, to provide, in one setting, a survey of a large body of work. We discuss both the practical aspects and theoretical underpinnings of ER. We describe existing solutions, current challenges and open research problems. In addition to giving attendees a thorough understanding of existing ER models, algorithms and evaluation methods, the tutorial will cover important research topics such as scalable ER, active and lightly supervised ER, and query-driven ER.

},
  author = {Lise Getoor and Ashwin Machanavajjhala}
}

@conference{moustafa:sigmod13,
  title = {GrDB: A System for Declarative and Interactive Analysis of Noisy Information Networks},
  booktitle = {SIGMOD},
  year = {2013},
  abstract = {

There is a growing interest in methods for analyzing data describing networks of all types, including biological, physical, social, and scientific collaboration networks. Typically the data describing these networks is observational, and thus noisy and incomplete; it is often at the wrong level of fidelity and abstraction for meaningful data analysis. This demonstration presents GrDB, a system that enables data analysts to write declarative programs to specify and combine different network data cleaning tasks, visualize the output, and engage in the process of decision review and correction if necessary. The declarative interface of GrDB makes it very easy to quickly write analysis tasks and execute them over data, while the visual component facilitates debugging the program and performing fine grained corrections.

},
  author = {Walaa Moustafa and Hui Miao and Amol Deshpande and Lise Getoor}
}

% Preprint: the note field carries the arXiv URL.
@unpublished{london:arxiv13a,
  title = {Graph-based Generalization Bounds for Learning Binary Relations},
  year = {2013},
  note = {http://arxiv.org/abs/1302.5348},
  publisher = {University of Maryland College Park},
  author = {London, Ben and Huang, Bert and Lise Getoor}
}

@conference{bach:uai13,
  title = {Hinge-loss Markov Random Fields: Convex Inference for Structured Prediction},
  booktitle = {Uncertainty in Artificial Intelligence},
  year = {2013},
  abstract = {

Graphical models for structured domains are powerful tools, but the computational complexities of combinatorial prediction spaces can force restrictions on models, or require approximate inference in order to be tractable. Instead of working in a combinatorial space, we use hinge-loss Markov random fields (HL-MRFs), an expressive class of graphical models with log-concave density functions over continuous variables, which can represent confidences in discrete predictions. This paper demonstrates that HL-MRFs are general tools for fast and accurate structured prediction. We introduce the first inference algorithm that is both scalable and applicable to the full class of HL-MRFs, and show how to train HL-MRFs with several learning algorithms. Our experiments show that HL-MRFs match or surpass the predictive performance of state-of-the-art methods, including discrete models, in four application domains.

},
  author = {Bach, Stephen H. and Huang, Bert and London, Ben and Lise Getoor}
}

@conference{miao:bigdata13,
  title = {A Hypergraph-Partitioned Vertex Programming Approach for Large-scale Consensus Optimization},
  booktitle = {2013 IEEE International Conference on Big Data},
  year = {2013},
  author = {Miao, Hui and Liu, Xiangyang and Huang, Bert and Lise Getoor}
}

@conference{pujara:wtbudg13,
  title = {Joint Judgments with a Budget: Strategies for Reducing the Cost of Inference},
  booktitle = {ICML Workshop on Machine Learning with Test-Time Budgets},
  year = {2013},
  author = {Pujara, Jay and Miao, Hui and Lise Getoor}
}

@conference{pujara:iswc13,
  title = {Knowledge Graph Identification},
  booktitle = {International Semantic Web Conference (ISWC)},
  year = {2013},
  note = {Winner of Best Student Paper award},
  author = {Pujara, Jay and Miao, Hui and Lise Getoor and Cohen, William}
}

@conference{kang:sbp13,
  title = {LA-LDA: A Limited Attention Topic Model for Social Recommendation},
  booktitle = {The 2013 International Conference on Social Computing, Behavioral-Cultural Modeling, \& Prediction (SBP 2013)},
  year = {2013},
  author = {Kang, Jeonhyung and Lerman, Kristina and Lise Getoor}
}

% The next two entries share a title but record different venues
% (an ICML workshop and an AAAI Fall Symposium).
@conference{pujara:slg13,
  title = {Large-Scale Knowledge Graph Identification using PSL},
  booktitle = {ICML Workshop on Structured Learning (SLG)},
  year = {2013},
  author = {Pujara, Jay and Miao, Hui and Lise Getoor and Cohen, William}
}

@conference{pujara:sbd13,
  title = {Large-Scale Knowledge Graph Identification using PSL},
  booktitle = {AAAI Fall Symposium on Semantics for Big Data},
  year = {2013},
  author = {Pujara, Jay and Miao, Hui and Lise Getoor and Cohen, William}
}

@conference{bach:fna13,
  title = {Large-margin Structured Learning for Link Ranking},
  booktitle = {NIPS Workshop on Frontiers of Network Analysis: Methods, Models, and Applications},
  year = {2013},
  note = {Winner of Best Student Paper award},
  author = {Bach, Stephen H.
and Huang, Bert and Lise Getoor}
}

@conference{bach:inferning13,
  title = {Learning Latent Groups with Hinge-loss Markov Random Fields},
  booktitle = {ICML Workshop on Inferning: Interactions between Inference and Learning},
  year = {2013},
  author = {Bach, Stephen H. and Huang, Bert and Lise Getoor}
}

@conference{ramesh:nipsws13,
  title = {Modeling Learner Engagement in MOOCs using Probabilistic Soft Logic},
  booktitle = {NIPS Workshop on Data Driven Education},
  year = {2013},
  author = {Ramesh, Arti and Goldwasser, Dan and Huang, Bert and Daume III, Hal and Lise Getoor}
}

% Preprint: the note field carries the arXiv URL.
@unpublished{london:arxiv13b,
  title = {Multi-relational Learning Using Weighted Tensor Decomposition with Modular Loss},
  year = {2013},
  note = {http://arxiv.org/abs/1303.1733},
  publisher = {University of Maryland College Park},
  author = {London, Ben and Rekatsinas, Theodoros and Huang, Bert and Lise Getoor}
}

@conference{pujara:akbc13,
  title = {Ontology-Aware Partitioning for Knowledge Graph Identification},
  booktitle = {CIKM Workshop on Automatic Knowledge Base Construction},
  year = {2013},
  author = {Pujara, Jay and Miao, Hui and Lise Getoor and Cohen, William}
}

% Venue acronym spelled "NIPS", as in the other workshop entries of this file.
@conference{london:nips13ws,
  title = {PAC-Bayes Generalization Bounds for Randomized Structured Prediction},
  booktitle = {NIPS Workshop on Perturbation, Optimization and Statistics},
  year = {2013},
  author = {London, Ben and Huang, Bert and Benjamin Taskar and Lise Getoor}
}

@conference{kimmig:probprog12,
  title = {A Short Introduction to Probabilistic Soft Logic},
  booktitle = {NIPS Workshop on PPFA},
  year = {2012},
  abstract = {

Probabilistic soft logic (PSL) is a framework for collective, probabilistic reasoning in relational domains. PSL uses first order logic rules as a template language for graphical models over random variables with soft truth values from the interval [0; 1]. Inference in this setting is a continuous optimization task, which can be solved efficiently. This paper provides an overview of the PSL language and its techniques for inference and weight learning. An implementation of PSL is available at http://psl.umiacs.umd.edu/.

},
  author = {Angelika Kimmig and Stephen Bach and Matthias Broecheler and Bert Huang and Lise Getoor}
}

@conference{moustafa:icde12,
  title = {Ego-centric Graph Pattern Census},
  booktitle = {International Conference on Data Engineering (ICDE)},
  year = {2012},
  author = {Moustafa, Walaa Eldin and Deshpande, Amol and Lise Getoor}
}

@conference{getoor:vldb12,
  title = {Entity Resolution: Theory, Practice \& Open Challenges},
  booktitle = {International Conference on Very Large Data Bases},
  year = {2012},
  note = {Slides: http://www.cs.umd.edu/~getoor/Tutorials/ER_VLDB2012.pdf},
  author = {Lise Getoor and Machanavajjhala, Ashwin}
}

@conference{getoor:aaai12t,
  title = {Entity Resolution: Theory, Practice, and Open Challenges},
  booktitle = {AAAI Conference on Artificial Intelligence},
  year = {2012},
  note = {URL: http://www.cs.umd.edu/projects/linqs/Tutorials/ER-AAAI12/Home.html},
  author = {Lise Getoor and Machanavajjhala, Ashwin}
}

% booktitle uses the same "IEEE/ACM" spelling as the publisher and
% organization fields of this entry.
@conference{getoor:asonam12t,
  title = {Entity Resolution for Social Network Analysis and Mining},
  booktitle = {IEEE/ACM International Conference on Advances in Social Networks Analysis and Mining},
  year = {2012},
  note = {URL: http://www.cs.umd.edu/~getoor/Tutorials/ER_ASONAM2012.pdf},
  publisher = {IEEE/ACM International Conference on Advances in Social Networks Analysis and Mining},
  organization = {IEEE/ACM International Conference on Advances in Social Networks Analysis and Mining},
  author = {Lise Getoor and Machanavajjhala, Ashwin}
}

@article{sharara:hj12,
  title = {Finding Prominent Actors in Dynamic Affiliation Networks},
  journal = {Human Journal},
  year = {2012},
  note = {Best Paper Award in ASE Conference 2012},
  author = {Sharara, Hossam and Singh, Lisa and Lise Getoor}
}

@conference{memory:ursw12,
  title = {Graph Summarization in Annotated Data Using Probabilistic Soft Logic},
  booktitle = {Proceedings of the International Workshop on Uncertainty Reasoning for the Semantic Web (URSW)},
  year = {2012},
  author = {Memory, Alex and Kimmig,
Angelika and Bach, Stephen H. and Raschid, Louiqa and Lise Getoor}
}

@conference{london:nips12asalsn,
  title = {Improved Generalization Bounds for Large-scale Structured Prediction},
  booktitle = {NIPS Workshop on Algorithmic and Statistical Approaches for Large Social Networks},
  year = {2012},
  author = {London, Ben and Huang, Bert and Lise Getoor}
}

@conference{rekatsinas:sigmod12,
  title = {Local Structure and Determinism in Probabilistic Databases},
  booktitle = {SIGMOD},
  year = {2012},
  abstract = {

While extensive work has been done on evaluating queries over tuple-independent probabilistic databases, query evaluation over correlated data has received much less attention even though the support for correlations is essential for many natural applications of probabilistic databases, e.g., information extraction, data integration, computer vision, etc. In this paper, we develop a novel approach for efficiently evaluating probabilistic queries over correlated databases where correlations are represented using a factor graph, a class of graphical models widely used for capturing correlations and performing statistical inference. Our approach exploits the specific values of the factor parameters and the determinism in the correlations, collectively called local structure, to reduce the complexity of query evaluation. Our framework is based on arithmetic circuits, factorized representations of probability distributions that can exploit such local structure. Traditionally, arithmetic circuits are generated following a compilation process and can not be updated directly. We introduce a generalization of arithmetic circuits, called annotated arithmetic circuits, and a novel algorithm for updating them, which enables us to answer probabilistic queries efficiently. We present a comprehensive experimental analysis and show speed-ups of at least one order of magnitude in many cases.

},
  author = {Theodoros Rekatsinas and Amol Deshpande and Lise Getoor}
}

@conference{london:nips12spectral,
  title = {Multi-relational Weighted Tensor Decomposition},
  booktitle = {NIPS Workshop on SL},
  year = {2012},
  author = {Ben London and Theodoros Rekatsinas and Bert Huang and Lise Getoor}
}

@book{zheleva:morganclaypool12,
  title = {Privacy in Social Networks},
  series = {Synthesis Lectures on Data Mining and Knowledge Discovery},
  volume = {4},
  year = {2012},
  publisher = {Morgan \& Claypool Publishers},
  organization = {Morgan \& Claypool Publishers},
  abstract = {

This synthesis lecture provides a survey of work on privacy in online social networks (OSNs). This work encompasses concerns of users as well as service providers and third parties. Our goal is to approach such concerns from a computer-science perspective, and building upon existing work on privacy, security, statistical modeling and databases to provide an overview of the technical and algorithmic issues related to privacy in OSNs. We start our survey by introducing a simple OSN data model and describe common statistical-inference techniques that can be used to infer potentially sensitive information. Next, we describe some privacy definitions and privacy mechanisms for data publishing. Finally, we describe a set of recent techniques for modeling, evaluating, and managing individual users{\textquoteright} privacy risk within the context of OSNs.

},
  author = {Elena Zheleva and Evimaria Terzi and Lise Getoor}
}

% booktitle carries no trailing blank inside the braces, so styles do not
% emit a space before the following punctuation.
@conference{huang:starai12,
  title = {Probabilistic Soft Logic for Trust Analysis in Social Networks},
  booktitle = {UAI Workshop on StaRAI},
  year = {2012},
  abstract = {

Trust plays a key role in social interactions. Explicitly modeling trust is therefore an important aspect of social network analysis in settings such as reputation management systems, recommendation systems, and viral marketing. Within the social sciences, trust is known to depend on network structure, context, individual actors{\textquoteright} attributes, and group memberships and affiliations. Furthermore, trust is often measured quantitatively, according to degrees of trust, rather than as a binary indicator. In this paper, we propose trust modeling as a rich challenge for statistical relational learning (SRL). Additionally, we show that probabilistic soft logic (PSL) is particularly well-suited for this problem. PSL, like many SRL languages, provides an intuitive framework for capturing the relational aspects of trust modeling, while its soft truth values easily accommodate varying strengths of trust. We model various sociological theories of trust in PSL and experimentally compare the resulting PSL programs to existing trust prediction methods, demonstrating the ease of model development and showing that these interpretable first-order logic models produce results of competitive quality.

},
  author = {Bert Huang and Angelika Kimmig and Lise Getoor and Jennifer Golbeck}
}

@conference{namata:mlg12-wkshp,
  title = {Query-driven Active Surveying for Collective Classification},
  booktitle = {ICML Workshop on MLG},
  year = {2012},
  abstract = {

In network classification problems such as those found in intelligence gathering, public health, and viral marketing, one is often only interested in inferring the labels of a subset of the nodes. We refer to this subset as the query set, and define the problem as query-driven collective classification. We study this problem in a practical active learning framework, in which the learning algorithm can survey non-query nodes to obtain their labels and network structure. We derive a surveying strategy aimed toward optimal inference on the query set. Considering both feature and structural smoothness, concepts that we formally define, we develop an algorithm which adaptively selects survey nodes by estimating which form of smoothness is most appropriate. We evaluate our algorithm on several network datasets and demonstrate its improvements over standard active learning methods.

},
  author = {Galileo Namata and Ben London and Lise Getoor and Bert Huang}
}

% Venue acronym uses the standard capitalisation "NeurIPS".
@conference{bach:nips12,
  title = {Scaling MPE Inference for Constrained Continuous Markov Random Fields with Consensus Optimization},
  booktitle = {NeurIPS},
  year = {2012},
  abstract = {

Probabilistic graphical models are powerful tools for analyzing constrained, continuous domains. However, finding most-probable explanations (MPEs) in these models can be computationally expensive. In this paper, we improve the scalability of MPE inference in a class of graphical models with piecewise-linear and piecewise-quadratic dependencies and linear constraints over continuous domains. We derive algorithms based on a consensus-optimization framework and demonstrate their superior performance over state of the art. We show empirically that in a large-scale voter-preference modeling problem our algorithms scale linearly in the number of dependencies and constraints.

},
  author = {Stephen Bach and Matthias Broecheler and Lise Getoor and Dianne O{\textquoteright}Leary}
}

% Venue acronym uses the standard capitalisation "NeurIPS".
@conference{huang:social2012,
  title = {Social Group Modeling with Probabilistic Soft Logic},
  booktitle = {NeurIPS Workshop on SNSMA},
  year = {2012},
  abstract = {

In this work, we show how to model the group affiliations of social media users using probabilistic soft logic. We consider groups of a broad variety, motivated by ideas from the social sciences on groups and their roles in social identity. By modeling group affiliations, we allow the possibility of efficient higher-level relational reasoning about the groups themselves, where the number of groups is relatively small compared to the number of users. We discuss preliminary results from experiments using real social media data collected from Twitter.

},
  author = {Huang, Bert and Bach, Stephen and Norris, Eric and Pujara, Jay and Lise Getoor}
}

% The page range 1--6 is stored in pages (it was previously under chapter).
@conference{sharara:ase12,
  title = {Stability vs. Diversity: Understanding the Dynamics of Actors in Time-varying Affiliation Networks},
  booktitle = {ICSI},
  year = {2012},
  pages = {1--6},
  abstract = {

Most networks contain embedded communities or groups that impact the overall gathering and dissemination of ideas and information. These groups consist of important or prominent individuals who actively participate in network activities over time. In this paper, we introduce a new method for identifying actors with prominent group memberships in time-varying affiliation networks. We define a prominent actor to be one who participates in the same group regularly (stable participation) and participates across different groups consistently (diverse participation), thereby having a position of structural influence in the network. Our proposed methods for quantifying stable and diverse participation take into consideration the underlying semantics for group participation as well as the level of impact of an actor{\textquoteright}s history on his or her current behavior. We illustrate the semantics of our measures on real-world data sets with varying temporal connectivity structures.

},
  author = {Sharara, Hossam and Singh, Lisa and Lise Getoor and Mann, Janet}
}

% The page range 1643--1655 is stored in pages (it was previously under chapter).
@article{getoor:tkde12b,
  title = {TACI: Taxonomy-Aware Catalog Integration},
  journal = {TKDE},
  volume = {25},
  year = {2012},
  pages = {1643--1655},
  abstract = {

A fundamental data integration task faced by online commercial portals and commerce search engines is the integration of products coming from multiple providers to their product catalogs. In this scenario, the commercial portal has its own taxonomy (the {\textquotedblleft}master taxonomy{\textquotedblright}), while each data provider organizes its products into a different taxonomy (the {\textquotedblleft}provider taxonomy{\textquotedblright}). In this paper, we consider the problem of categorizing products from the data providers into the master taxonomy, while making use of the provider taxonomy information. Our approach is based on a taxonomy-aware processing step that adjusts the results of a text-based classifier to ensure that products that are close together in the provider taxonomy remain close in the master taxonomy. We formulate this intuition as a structured prediction optimization problem. To the best of our knowledge, this is the first approach that leverages the structure of taxonomies in order to enhance catalog integration. We propose algorithms that are scalable and thus applicable to the large datasets that are typical on the Web. We evaluate our algorithms on real-world data and we show that taxonomy-aware classification provides a significant improvement over existing approaches.

},
  author = {Papadimitriou, Panagiotis and Tsaparas, Panayiotis and Fuxman, Ariel and Lise Getoor}
}

% Author names are in "Last, First" form; venue acronym uses the standard
% capitalisation "NeurIPS".
@conference{ramesh:nips12,
  title = {User Role Prediction in Online Discussion Forums using Probabilistic Soft Logic},
  booktitle = {NeurIPS Workshop on PE},
  year = {2012},
  author = {Ramesh, Arti and Yoo, Jaebong and Shen, Shitian and Lise Getoor and Kim, Jihie}
}

@conference{chen:wpov11,
  title = {Active Inference for Retrieval in Camera Networks},
  booktitle = {IEEE Workshop on Person-Oriented Vision},
  year = {2011},
  abstract = {

We address the problem of searching camera network videos to retrieve frames containing specified individuals. We show the benefit of utilizing a learned probabilistic model that captures dependencies among the cameras. In addition, we develop an active inference framework that can request human input at inference time, directing human attention to the portions of the videos whose correct annotation would provide the biggest performance improvements. Our primary contribution is to show that by mapping video frames in a camera network onto a graphical model, we can apply collective classification and active inference algorithms to significantly increase the performance of the retrieval system, while minimizing the number of human annotations required.

},
  author = {Chen, Daozheng and Bilgic, Mustafa and Lise Getoor and Jacobs, David and Mihalkova, Lilyana and Yeh, Tom}
}

@conference{sharara:ijcai11,
  title = {Active Surveying: A Probabilistic Approach for Identifying Key Opinion Leaders},
  booktitle = {IJCAI},
  year = {2011},
  abstract = {

Opinion leaders play an important role in influencing people{\textquoteright}s beliefs, actions and behaviors. Although a number of methods have been proposed for identifying influentials using secondary sources of information, the use of primary sources, such as surveys, is still favored in many domains. In this work we present a new surveying method which combines secondary data with partial knowledge from primary sources to guide the information gathering process. We apply our proposed active surveying method to the problem of identifying key opinion leaders in the medical field, and show how we are able to accurately identify the opinion leaders while minimizing the amount of primary data required, which results in significant cost reduction in data acquisition without sacrificing its integrity.

},
  author = {Sharara, Hossam and Lise Getoor and Norton, Myra}
}

@conference{namata:kdd11,
  title = {Collective Graph Identification},
  booktitle = {KDD},
  year = {2011},
  abstract = {

Data describing networks (communication networks, transaction networks, disease transmission networks, collaboration networks, etc.) is becoming increasingly ubiquitous. While this observational data is useful, it often only hints at the actual underlying social or technological structures which give rise to the interactions. For example, an email communication network provides useful insight but is not the same as the {\textquotedblleft}real{\textquotedblright} social network among individuals. In this paper, we introduce the problem of graph identification, i.e., the discovery of the true graph structure underlying an observed network. We cast the problem as a probabilistic inference task, in which we must infer the nodes, edges, and node labels of a hidden graph, based on evidence provided by the observed network. This in turn corresponds to the problems of performing entity resolution, link prediction, and node labeling to infer the hidden graph. While each of these problems have been studied separately, they have never been considered together as a coherent task. We present a simple yet novel approach to address all three problems simultaneously. Our approach, called C3, consists of Coupled Collective Classifiers that are iteratively applied to propagate information among solutions to the problems. We empirically demonstrate that C3 is superior, in terms of both predictive accuracy and runtime, to state-of-the-art probabilistic approaches on three real-world problems.

},
  author = {Namata, Galileo and Kok, Stanley and Lise Getoor}
}

@conference {moustafa:gdm11,
  title = {Declarative Analysis of Noisy Information Networks},
  booktitle = {ICDE Workshop on GDM},
  year = {2011},
  abstract = {

There is a growing interest in methods for analyzing data describing networks of all types, including information, biological, physical, and social networks. Typically the data describing these networks is observational, and thus noisy and incomplete; it is often at the wrong level of fidelity and abstraction for meaningful data analysis. This has resulted in a growing body of work on extracting, cleaning, and annotating network data. Unfortunately, much of this work is ad hoc and domain-specific. In this paper, we present the architecture of a data management system that enables efficient, declarative analysis of large-scale information networks. We identify a set of primitives to support the extraction and inference of a network from observational data, and describe a framework that enables a network analyst to easily implement and combine new extraction and analysis techniques, and efficiently apply them to large observation networks. The key insight behind our approach is to decouple, to the extent possible, (a) the operations that require traversing the graph structure (typically the computationally expensive step), from (b) the operations that do the modification and update of the extracted network. We present an analysis language based on Datalog, and show how to use it to cleanly achieve such decoupling. We briefly describe our prototype system that supports these abstractions. We include a preliminary performance evaluation of the system and show that our approach scales well and can efficiently handle a wide spectrum of data cleaning operations on network data.

},
  author = {Moustafa, Walaa and Namata, Galileo and Deshpande, Amol and Lise Getoor}
}

@conference {sharara:icwsm11,
  title = {Differential Adaptive Diffusion: Understanding Diversity and Learning whom to Trust in Viral Marketing},
  booktitle = {ICWSM},
  year = {2011},
  abstract = {

Viral marketing mechanisms use the existing social network between customers to spread information about products and encourage product adoption. Existing viral marketing models focus on the dynamics of the diffusion process, however they typically: (a) only consider a single product campaign and (b) fail to model the evolution of the social network, as the trust between individuals changes over time, during the course of multiple campaigns. In this work, we propose an adaptive viral marketing model which captures: (1) multiple different product campaigns, (2) the diversity in customer preferences among different product categories, and (3) changing confidence in peers{\textquoteright} recommendations over time. By applying our model to a real-world network extracted from the Digg social news website, we provide insights into the effects of network dynamics on the different products{\textquoteright} adoption. Our experiments show that our proposed model outperforms earlier non-adaptive diffusion models in predicting future product adoptions. We also show how this model can be used to explore new viral marketing strategies that are more successful than classic strategies which ignore the dynamic nature of social networks.

},
  author = {Sharara, Hossam and Rand, William and Lise Getoor}
}

@article {chen:pami11,
  title = {Dynamic Processing Allocation in Video},
  journal = {PAMI},
  volume = {33},
  number = {11},
  year = {2011},
  pages = {2174--2187},
  abstract = {

Large stores of digital video pose severe computational challenges to existing video analysis algorithms. In applying these algorithms, users must often trade off processing speed for accuracy, as many sophisticated and effective algorithms require large computational resources that make it impractical to apply them throughout long videos. One can save considerable effort by applying these expensive algorithms sparingly, directing their application using the results of more limited processing. We show how to do this for retrospective video analysis by modeling a video using a chain graphical model and performing inference both to analyze the video and to direct processing. We apply our method to problems in background subtraction and face detection, and show in experiments that this leads to significant improvements over baseline algorithms.

},
  author = {Chen, Daozheng and Bilgic, Mustafa and Lise Getoor and Jacobs, David}
}

@article {getoor:sdm11tutorial,
  title = {Exploiting Statistical and Relational Information on the Web and in Social Media},
  year = {2011},
  keywords = {Statistical relational learning, social media, tutorial, web},
  author = {Lise Getoor and Mihalkova, Lilyana}
}

@conference {sharara:vast11,
  title = {G-PARE: A Visual Analytic Tool for Comparative Analysis of Uncertain Graphs},
  booktitle = {IEEE Conference on Visual Analytics Science and Technology (VAST)},
  year = {2011},
  keywords = {Comparative Analysis, Model Comparison, Uncertain Graphs, Visualizing Uncertainty},
  author = {Sharara, Hossam and Sopan, Awalin and Namata, Galileo Mark and Lise Getoor and Singh, Lisa}
}

@conference {minton:cmla11,
  title = {Improving Classifier Performance by Autonomously Collecting Background Knowledge from the Web},
  booktitle = {Tenth International Conference on Machine Learning and Applications},
  year = {2011},
  author = {Minton, Steve and Michelson, Matthew and See, Kane and Macskassy, Sofus and Gazen, Bora C. and Lise Getoor}
}

@conference {mihalkova:wsdm-wkshop11,
  title = {Learning to Predict Web Collaborations},
  booktitle = {WSDM Workshop on User Modeling for Web Applications},
  year = {2011},
  author = {Mihalkova, Lilyana and Moustafa, Walaa Eldin and Lise Getoor}
}

@unpublished {mihalkova:arxive11,
  title = {Lifted Graphical Models: A Survey},
  year = {2011},
  note = {Arxiv preprint arXiv:1107.4966v2},
  author = {Mihalkova, Lilyana and Lise Getoor}
}

@conference {sharara:sunbelt11,
  title = {Multi-dimensional Trajectory Analysis for Career Histories},
  booktitle = {International Sunbelt Social Networks Conference (Sunbelt XXXI)},
  year = {2011},
  author = {Sharara, Hossam and Halgin, Daniel and Lise Getoor and Borgatti, Steve}
}

@book {zheleva:snda10,
  title = {Privacy in Social Networks: A Survey},
  series = {Social Network Data Analytics},
  volume = {1},
  year = {2011},
  month = {March},
  pages = {247{\textendash}276},
  publisher = {Springer},
  organization = {Springer},
  chapter = {10},
  abstract = {

In this chapter, we survey the literature on privacy in social networks. We focus both on online social networks and online affiliation networks. We formally define the possible privacy breaches and describe the privacy attacks that have been studied. We present definitions of privacy in the context of anonymization together with existing anonymization techniques.

},
  author = {Elena Zheleva and Lise Getoor},
  editor = {Charu Aggarwal}
}

@conference {plangprasopchok:wsdm2011,
  title = {A Probabilistic Approach for Learning Folksonomies from Structured Data},
  booktitle = {Fourth ACM International Conference on Web Search and Data Mining (WSDM)},
  year = {2011},
  author = {Plangprasopchok, Anon and Lerman, Kristina and Lise Getoor}
}

@conference {getoor:icml11,
  title = {Proceedings of the 28th International Conference on Machine Learning},
  booktitle = {Proceedings of the 28th International Conference on Machine Learning},
  year = {2011},
  author = {Lise Getoor and Scheffer, Tobias}
}

@conference {pujara:icmlws11,
  title = {Reducing Label Cost by Combining Feature Labels and Crowdsourcing},
  booktitle = {ICML Workshop on Combining Learning Strategies to Reduce Label Cost},
  year = {2011},
  author = {Pujara, Jay and London, Ben and Lise Getoor}
}

@article {sharara:snam10,
  title = {Understanding Actor Loyalty to Event-Based Groups in Affiliation Networks},
  journal = {Journal of Advances in Social Networks Analysis and Mining},
  volume = {1},
  number = {2},
  year = {2011},
  month = {April},
  pages = {115{\textendash}126},
  author = {Sharara, Hossam and Singh, Lisa and Lise Getoor and Mann, Janet}
}

@conference {pujara:ceas11,
  title = {Using Classifier Cascades for Scalable E-Mail Classification},
  booktitle = {Collaboration, Electronic Messaging, Anti-Abuse and Spam Conference},
  series = {ACM International Conference Proceedings Series},
  year = {2011},
  note = {Winner of a Best Paper award},
  publisher = {ACM},
  organization = {ACM},
  author = {Pujara, Jay and Daume III, Hal and Lise Getoor}
}

@article {bilgic:jair11,
  title = {Value of Information Lattice: Exploiting Probabilistic Independence for Effective Feature Subset Acquisition},
  journal = {Journal of Artificial Intelligence Research (JAIR)},
  volume = {41},
  year = {2011},
  pages = {69{\textendash}95},
  author = {Bilgic, Mustafa and Lise Getoor}
}

@article {licamele:jbcb11,
  title = {A method for the detection of meaningful and reproducible group signatures from gene expression profiles},
  journal = {Journal of Bioinformatics and Computational Biology},
  year = {2011},
  author = {Licamele, Louis and Lise Getoor}
}

@book {namata:lmbook10,
  title = {A Survey of Link Mining Tasks for Analyzing Noisy and Incomplete Networks},
  series = {Link Mining: Models, Algorithms, and Applications},
  volume = {1},
  year = {2010},
  pages = {107--133},
  publisher = {Springer},
  organization = {Springer},
  edition = {1},
  chapter = {4},
  abstract = {

Many data sets of interest today are best described as networks or graphs of interlinked entities. Examples include Web and text collections, social networks and social media sites, information, transaction and communication networks, and all manner of scientific networks, including biological networks. Unfortunately, often the data collection and extraction process for gathering these network data sets is imprecise, noisy, and/or incomplete. In this chapter, we review a collection of link mining algorithms that are well suited to analyzing and making inferences about networks, especially in the case where the data is noisy or missing.

},
  author = {Galileo Namata and Hossam Sharara and Lise Getoor},
  editor = {Philip Yu and Jiawei Han and Christos Faloutsos}
}

@conference {bilgic:aaai10,
  title = {Active Inference for Collective Classification},
  booktitle = {Twenty-Fourth Conference on Artificial Intelligence (AAAI NECTAR Track)},
  year = {2010},
  pages = {1652{\textendash}1655},
  author = {Bilgic, Mustafa and Lise Getoor}
}

@conference {sharara:win10,
  title = {An Active Learning Approach for Identifying Key Opinion Leaders},
  booktitle = {The 2nd Workshop on Information in Networks (WIN)},
  year = {2010},
  author = {Sharara, Hossam and Lise Getoor and Norton, Myra}
}

@conference {bilgic:icml10,
  title = {Active Learning for Networked Data},
  booktitle = {Proceedings of the 27th International Conference on Machine Learning (ICML-10)},
  year = {2010},
  author = {Bilgic, Mustafa and Mihalkova, Lilyana and Lise Getoor}
}

@conference {sharara:nips2010-nad,
  title = {Active Surveying},
  booktitle = {NIPS Workshop on Networks Across Disciplines in Theory and Applications},
  year = {2010},
  author = {Sharara, Hossam and Lise Getoor and Norton, Myra}
}

@conference {sharara:sunbelt10,
  title = {Active Surveying for Leadership Identification},
  booktitle = {The International Sunbelt Social Networks Conference XXX},
  year = {2010},
  author = {Sharara, Hossam and Norton, Myra and Lise Getoor}
}

@conference {pujara:nips10,
  title = {Coarse-to-Fine, Cost-Sensitive Classification of E-Mail},
  booktitle = {NIPS Workshop on Coarse-to-Fine Processing},
  year = {2010},
  author = {Pujara, Jay and Lise Getoor}
}

@article {sen:eml10,
  title = {Collective Classification},
  journal = {Encyclopedia of Machine Learning},
  year = {2010},
  author = {Sen, Prithviraj and Namata, Galileo Mark and Bilgic, Mustafa and Lise Getoor}
}

@conference {broecheler:nips10,
  title = {Computing marginal distributions over continuous Markov networks for statistical relational learning},
  booktitle = {Advances in Neural Information Processing Systems (NIPS)},
  year = {2010},
  author = {Broecheler, Matthias and Lise Getoor}
}

@conference {bach:pmpm10,
  title = {Decision-Driven Models with Probabilistic Soft Logic},
  booktitle = {NIPS Workshop on Predictive Models in Personalized Medicine},
  year = {2010},
  author = {Bach, Stephen H. and Broecheler, Matthias and Kok, Stanley and Lise Getoor}
}

@article {sharara:eml10,
  title = {Group Detection},
  journal = {Encyclopedia of Machine Learning},
  year = {2010},
  author = {Sharara, Hossam and Lise Getoor}
}

@conference {plang:kdd10,
  title = {Growing a tree in the forest: constructing folksonomies by integrating structured metadata},
  booktitle = {ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
  year = {2010},
  author = {Plangprasopchok, Anon and Lerman, Kristina and Lise Getoor}
}

@conference {zheleva:nips10,
  title = {Higher-order Graphical Models for Classification in Social and Affiliation Networks},
  booktitle = {NIPS Workshop on Networks Across Disciplines: Theory and Applications},
  year = {2010},
  author = {Zheleva, Elena and Lise Getoor and Sarawagi, Sunita}
}

@article {licamele:bmcbio10,
  title = {Indirect two-sided relative ranking: a robust similarity measure for gene expression data},
  journal = {BMC Bioinformatics},
  year = {2010},
  keywords = {gene expression bioinformatics drug therapeutics},
  author = {Licamele, Louis and Lise Getoor}
}

@conference {doppa:ecml10,
  title = {Learning Algorithms for Link Prediction based on Chance Constraints},
  booktitle = {European Conference on Machine Learning (ECML)},
  year = {2010},
  author = {Doppa, Janardhan and Yu, Jun and Tadepalli, Prasad and Lise Getoor}
}

@article {namata:eml10,
  title = {Link Prediction},
  journal = {Encyclopedia of Machine Learning},
  year = {2010},
  author = {Namata, Galileo Mark and Lise Getoor}
}

@article {hwang:kde10,
  title = {Organizing User Search Histories},
  journal = {IEEE Transactions on Knowledge and Data Engineering},
  year = {2010},
  author = {Hwang, Heasoo and Lauw, Hady and Lise Getoor and Ntoulas, Alexandros}
}

@conference {broecheler:uai10,
  title = {Probabilistic Similarity Logic},
  booktitle = {Conference on Uncertainty in Artificial Intelligence},
  year = {2010},
  author = {Broecheler, Matthias and Mihalkova, Lilyana and Lise Getoor}
}

@conference {sen:vldb10,
  title = {Read-Once Functions and Query Evaluation in Probabilistic Databases},
  booktitle = {International Conference on Very Large Data Bases},
  year = {2010},
  author = {Sen, Prithviraj and Deshpande, Amol and Lise Getoor}
}

@conference {sen:uai09,
  title = {Bisimulation-based Approximate Lifted Inference},
  booktitle = {Uncertainty in Artificial Intelligence},
  year = {2009},
  author = {Sen, Prithviraj and Deshpande, Amol and Lise Getoor}
}

@conference {doppa:nips09-wkshp,
  title = {Chance-Constrained Programs for Link Prediction},
  booktitle = {NIPS Workshop on Analyzing Networks and Learning with Graphs},
  year = {2009},
  author = {Doppa, Janardhan and Yu, Jun and Tadepalli, Prasad and Lise Getoor}
}

@conference {zheleva:kdd09,
  title = {Co-evolution of Social and Affiliation Networks},
  booktitle = {15th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD)},
  year = {2009},
  month = {June},
  author = {Zheleva, Elena and Sharara, Hossam and Lise Getoor}
}

@book {namata:tmbook09,
  title = {Collective Classification for Text Classification},
  series = {Text Mining: Classification, Clustering, and Applications},
  volume = {1},
  year = {2009},
  pages = {51--69},
  publisher = {Taylor and Francis Group},
  organization = {Taylor and Francis Group},
  edition = {1},
  chapter = {3},
  abstract = {

Text classification, the classification of text documents according to categories or topics, is an important component of any text processing system. There is a large body of work which makes use of content{\textendash}the words appearing in the documents, the structure of the documents{\textendash}and external sources to build accurate document classifiers. In addition, there is a growing body of literature on methods which attempt to make use of the link structure among the documents in order to improve document classification performance. Text documents can be connected together in a variety of ways. The most common link structure is the citation graph: e.g., papers cite other papers and webpages link to other webpages. But links among papers can be constructed from other relationships such as co-author, co-citation, appearance at a conference venue, and others. All of these can be combined together to create an interlinked collection of text documents. In these cases, we are often not interested in determining the topic of just a single document, but we have a collection of unlabeled (or partially labeled) documents, and we want to correctly infer values for all of the missing labels.

},
  author = {Galileo Namata and Prithviraj Sen and Mustafa Bilgic and Lise Getoor},
  editor = {Mehran Sahami and Ashok Srivastava}
}

@article {Polymeropoulos:SchizRes09,
  title = {Common effect of antipsychotics on the biosynthesis and regulation of fatty acids and cholesterol supports a key role of lipid homeostasis in schizophrenia},
  journal = {Schizophrenia Research},
  year = {2009},
  keywords = {bioinformatics gene expression analysis antipsychotic pharmacogenetics},
  author = {Polymeropoulos, Mihales and Licamele, Louis and Volpi, Simona and Mack, Kendra and Mitkus, Shruti and Carstea, Eugene and Lise Getoor and Lavedan, Christian}
}

@conference {barash:wsm09,
  title = {Distinguishing Knowledge vs Social Capital in Social Media with Roles and Context},
  booktitle = {International Conference on Weblogs and Social Media},
  year = {2009},
  month = {May},
  author = {Barash, Vladimir and Smith, Marc and Lise Getoor and Welser, Howard}
}

@conference {sharara:asonam09,
  title = {The Dynamics of Actor Loyalty to Groups in Affiliation Networks},
  booktitle = {International Conference on Advances in Social Networks Analysis and Mining},
  year = {2009},
  month = {July},
  author = {Sharara, Hossam and Singh, Lisa and Lise Getoor and Mann, Janet}
}

@conference {chen:nips09-wkshp,
  title = {Efficient Resource-constrained Retrospective Analysis of Long Video Sequences},
  booktitle = {NIPS Workshop on Adaptive Sensing, Active Learning and Experimental Design: Theory, Methods and Applications},
  year = {2009},
  author = {Chen, Daozheng and Bilgic, Mustafa and Lise Getoor and Jacobs, David}
}

@conference {sayyadi:sdm09,
  title = {Future Rank: Ranking Scientific Articles by Predicting their Future PageRank},
  booktitle = {2009 SIAM International Conference on Data Mining (SDM09)},
  year = {2009},
  month = {April},
  author = {Sayyadi, Hassan and Lise Getoor}
}

@book {deshpande:mmudchapter09,
  title = {Graphical Models for Uncertain Data},
  series = {Managing and Mining Uncertain Data},
  volume = {1},
  year = {2009},
  pages = {1--34},
  publisher = {Springer},
  organization = {Springer},
  edition = {1},
  chapter = {1},
  abstract = {

Graphical models are a popular and well-studied framework for compact representation of a joint probability distribution over a large number of interdependent variables, and for efficient reasoning about such a distribution. They have been proven useful in a wide range of domains from natural language processing to computer vision to bioinformatics. In this chapter, we present an approach to using graphical models for managing and querying large-scale uncertain databases. We present a unified framework based on the concepts from graphical models that can model not only tuple-level and attribute-level uncertainties, but can also handle arbitrary correlations that may be present among the data; our framework can also naturally capture shared correlations where the same uncertainties and correlations occur repeatedly in the data. We develop an efficient strategy for query evaluation over such probabilistic databases by casting the query processing problem as an inference problem in an appropriately constructed graphical model, and present optimizations specific to probabilistic databases that enable efficient query evaluation. We conclude the chapter with a discussion of related and future work on these topics.

},
  author = {Amol Deshpande and Lise Getoor and Prithviraj Sen},
  editor = {Charu Aggarwal}
}

@conference {namata:kddu09,
  title = {Identifying Graphs From Noisy and Incomplete Data},
  booktitle = {1st ACM SIGKDD Workshop on Knowledge Discovery from Uncertain Data},
  year = {2009},
  author = {Namata, Galileo Mark and Lise Getoor}
}

@conference {schnaitter:vldb09,
  title = {Index Interactions in Physical Design Tuning: Modeling, Analysis, and Applications},
  booktitle = {International Conference on Very Large Data Bases},
  year = {2009},
  author = {Schnaitter, Karl and Polyzotis, Neoklis and Lise Getoor}
}

@conference {bilgic:nips09-wkshp,
  title = {Link-based Active Learning},
  booktitle = {NIPS Workshop on Analyzing Networks and Learning with Graphs},
  year = {2009},
  author = {Bilgic, Mustafa and Lise Getoor}
}

@conference {saha:sdm09,
  title = {On Maximum Coverage in the Streaming Model \& Application to Multi-topic Blog-Watch},
  booktitle = {2009 SIAM International Conference on Data Mining (SDM09)},
  year = {2009},
  month = {April},
  author = {Saha, Barna and Lise Getoor}
}

@conference {somasundaran:textgraphs09,
  title = {Opinion Graphs for Polarity and Discourse Classification},
  booktitle = {TextGraphs-4: Graph-based Methods for Natural Language Processing},
  year = {2009},
  month = {August},
  author = {Somasundaran, Swapna and Namata, Galileo Mark and Lise Getoor and Wiebe, Janyce}
}

@conference {namata:mlg09,
  title = {A Pipeline Approach to Graph Identification},
  booktitle = {Seventh International Workshop on Mining and Learning with Graphs},
  year = {2009},
  author = {Namata, Galileo Mark and Lise Getoor}
}

@article {sen:vldbj09,
  title = {PrDB: Managing and Exploiting Rich Correlations in Probabilistic Databases},
  journal = {VLDB Journal, special issue on uncertain and probabilistic databases},
  year = {2009},
  author = {Sen, Prithviraj and Deshpande, Amol and Lise Getoor}
}

@conference {broecheler:srl09,
  title = {Probabilistic Similarity Logic},
  booktitle = {International Workshop on Statistical Relational Learning (SRL{\textquoteright}09)},
  year = {2009},
  author = {Broecheler, Matthias and Lise Getoor}
}

@article {bilgic:tkdd09,
  title = {Reflect and Correct: A Misclassification Prediction Approach to Active Inference},
  journal = {ACM Transactions on Knowledge Discovery from Data},
  volume = {3},
  number = {4},
  year = {2009},
  month = {November},
  pages = {1{\textendash}32},
  author = {Bilgic, Mustafa and Lise Getoor}
}

@conference {somasundaran:emnlp09,
  title = {Supervised and Unsupervised Methods in Employing Discourse Relations for Improving Opinion Polarity Classification},
  booktitle = {Conference on Empirical Methods in Natural Language Processing},
  year = {2009},
  month = {August},
  author = {Somasundaran, Swapna and Namata, Galileo Mark and Wiebe, Janyce and Lise Getoor}
}

@conference {zheleva:www09,
  title = {To Join or not to Join: The Illusion of Privacy in Social Networks with Mixed Public and Private User Profiles},
  booktitle = {18th International World Wide Web conference (WWW)},
  year = {2009},
  note = {Earlier version appears as CS-TR-4926.},
  month = {April},
  keywords = {anonymity online, groups, privacy, sensitive attribute inference, social networks},
  author = {Zheleva, Elena and Lise Getoor}
}

@conference {zheleva:umtr08,
  title = {To Join or not to Join: The Illusion of Privacy in Social Networks with Mixed Public and Private User Profiles},
  booktitle = {The Web Conference (WWW)},
  number = {CS-TR-4926},
  year = {2009},
  note = {An earlier version appears as CS-TR-4922, July 2008},
  publisher = {University of Maryland},
  organization = {University of Maryland},
  address = {College Park},
  keywords = {anonymity online, groups, privacy, sensitive attribute inference, social networks},
  author = {Zheleva, Elena and Lise Getoor}
}

@book {islamaj:fga-book07,
  title = {A Feature Generation Algorithm with Applications to Biological Sequence Classification},
  series = {Computational Methods of Feature Selection},
  volume = {1},
  year = {2008},
  pages = {355--376},
  publisher = {Chapman and Hall/CRC Press},
  organization = {Chapman and Hall/CRC Press},
  edition = {1},
  chapter = {18},
  author = {Rezarta Islamaj and Lise Getoor and John Wilbur},
  editor = {Huan Liu and Hiroshi Motoda}
}

@article {sen:aimag08,
  title = {Collective Classification in Network Data},
  journal = {AI Magazine},
  volume = {29},
  number = {3},
  year = {2008},
  pages = {93{\textendash}106},
  author = {Sen, Prithviraj and Namata, Galileo Mark and Bilgic, Mustafa and Lise Getoor and Gallagher, Brian and Eliassi-Rad, Tina}
}

@book {bhattacharya:ch08,
  title = {Collective Relational Clustering},
  series = {Constrained Clustering: Advances in Algorithms, Theory, and Applications},
  volume = {1},
  year = {2008},
  pages = {221--244},
  publisher = {Chapman and Hall},
  organization = {Chapman and Hall},
  edition = {1},
  chapter = {10},
  abstract = {

In many clustering problems, in addition to attribute data, we have relational information, linking different data points. In this chapter, we focus on the problem of collective relational clustering that makes use of both attribute and relational information. The approach is collective in that clustering decisions are not taken in an independent fashion for each pair of data points. Instead, the different pairwise decisions depend on each other. The first set of dependencies is among multiple decisions involving the same data point. The other set of dependencies come from the relationships. Decisions for any two references that are related in the data are also dependent on each other. Hence, the approach is collective as well as relational. We focus on the entity resolution problem as an application of the clustering problem, and we survey different proposed approaches that are collective or make use of relationships. One of the approaches is an agglomerative greedy clustering algorithm where the cluster similarity measure combines both attributes and relationships in a collective way. We discuss the algorithmic details of this approach and identify data characteristics that influence its correctness. We also present experimental results on multiple real-world and synthetic data sets. Often in clustering problems, in addition to the attributes describing the data items to be clustered, there are links among the items. These links are co-occurrence links indicating that the data items were observed together in, for example, a market basket, a text document, or some other relational context. Relational clustering approaches make use of both the attributes of the instances and the observed co-occurrences to do a better job at clustering.

},
  author = {Indrajit Bhattacharya and Lise Getoor}
}

@article {sen:dmkd08,
  title = {Cost-Sensitive Learning with Conditional Markov Networks},
  journal = {Data Mining and Knowledge Discovery, Special Issue on Utility Based Data Mining},
  volume = {17},
  number = {2},
  year = {2008},
  month = {October},
  pages = {136{\textendash}163},
  author = {Sen, Prithviraj and Lise Getoor}
}

@conference {bilgic:kdd08,
  title = {Effective Label Acquisition for Collective Classification},
  booktitle = {ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
  year = {2008},
  note = {Winner of the KDD{\textquoteright}08 Best Student Paper Award.},
  pages = {43{\textendash}51},
  author = {Bilgic, Mustafa and Lise Getoor}
}

@conference {sen:vldb08,
  title = {Exploiting Shared Correlations in Probabilistic Databases},
  booktitle = {International Conference on Very Large Data Bases},
  year = {2008},
  author = {Sen, Prithviraj and Deshpande, Amol and Lise Getoor}
}

@conference {saha:snakdd08,
  title = {Group Proximity Measure for Recommending Groups in Online Social Networks},
  booktitle = {2nd ACM SIGKDD Workshop on Social Network Mining and Analysis (SNA-KDD)},
  year = {2008},
  author = {Saha, Barna and Lise Getoor}
}

@article {kang:tvcg08,
  title = {Interactive Entity Resolution in Relational Data: A Visual Analytic Tool and Its Evaluation},
  journal = {IEEE Transactions on Visualization and Computer Graphics},
  volume = {14},
  number = {5},
  year = {2008},
  pages = {999{\textendash}1014},
  author = {Kang, Hyunmo and Lise Getoor and Shneiderman, Ben and Bilgic, Mustafa and Licamele, Louis}
}

@conference {smith:cikm07-ssm,
  title = {Leveraging Social Context for Searching Social Media},
  booktitle = {CIKM Workshop on Search in Social Media},
  year = {2008},
  author = {Smith, Marc and Barash, Vladimir and Lise Getoor and Lauw, Hady}
}

@unpublished {elsayed:umtr08,
  title = {Personal Name Resolution in Email: A Heuristic Approach},
  number = {LAMP-TR150},
  year = {2008},
  month = {March},
  publisher = {University of Maryland, College Park},
  author = {Elsayed, Tamer and Oard, Doug and Namata, Galileo Mark and Lise Getoor}
}

@conference {zheleva:kdd07-lncs,
  title = {Preserving the Privacy of Sensitive Relationships in Graph Data},
  booktitle = {Proceedings of the First SIGKDD International Workshop on Privacy, Security, and Trust in KDD (PinKDD 2007)},
  series = {Lecture Notes in Computer Science},
  volume = {4890},
  year = {2008},
  note = {This is an extended version of the original workshop paper.},
  month = {March},
  pages = {153--171},
  publisher = {Springer},
  organization = {Springer},
  author = {Zheleva, Elena and Lise Getoor}
}

@article {dietterich:ml08,
  title = {Structured machine learning: the next ten years},
  journal = {Machine Learning},
  volume = {73},
  number = {1},
  year = {2008},
  note = {Full version is available at http://dx.doi.org/10.1007/s10994-008-5079-1},
  pages = {3{\textendash}23},
  author = {Dietterich, Thomas and Domingos, Pedro and Lise Getoor and Muggleton, Stephen and Tadepalli, Prasad}
}

@article {zheleva:tois08,
  title = {Trusting Spam Reporters: A Reporter-based Reputation System for Email Filtering},
  journal = {ACM Transactions on Information Systems},
  volume = {27},
  number = {1},
  year = {2008},
  note = {Full version in ACM library},
  month = {December},
  author = {Zheleva, Elena and Kolcz, Alek and Lise Getoor}
}

@conference {zheleva:snakdd08,
  title = {Using Friendship Ties and Family Circles for Link Prediction},
  booktitle = {2nd ACM SIGKDD Workshop on Social Network Mining and Analysis (SNA-KDD)},
  year = {2008},
  author = {Zheleva, Elena and Lise Getoor and Golbeck, Jennifer and Kuter, Ugur}
}

@conference {kang:vast07,
  title = {C-GROUP: A Visual Analytic Tool for Pairwise Analysis of Dynamic Group Membership},
  booktitle = {Visual Analytics Science and Technology (VAST)},
  year = {2007},
  author = {Kang, Hyunmo and Lise Getoor and Singh, Lisa}
}

@conference {islamaj:icdm07,
  title = {Characterizing RNA secondary-structure features and their effects on splice-site prediction},
  booktitle = {IEEE ICDM Workshop on Mining and Management of Biological Data},
  year = {2007},
  author = {Islamaj, Rezarta and Lise Getoor and Wilbur, W. John}
}

@article {bhattacharya:tkdd07,
  title = {Collective Entity Resolution In Relational Data},
  journal = {ACM Transactions on Knowledge Discovery from Data},
  volume = {1},
  number = {1},
  year = {2007},
  month = {March},
  pages = {1--36},
  author = {Bhattacharya, Indrajit and Lise Getoor}
}

@conference {bilgic:icdm07,
  title = {Combining Collective Classification and Link Prediction},
  booktitle = {Workshop on Mining Graphs and Complex Structures at the IEEE International Conference on Data Mining (ICDM-2007)},
  year = {2007},
  author = {Bilgic, Mustafa and Namata, Galileo Mark and Lise Getoor}
}

@conference {udrea:ijcaiws07,
  title = {Combining statistical and logical inference for ontology alignment},
  booktitle = {Workshop on Semantic Web for Collaborative Knowledge Acquisition at the International Joint Conference on Artificial Intelligence},
  year = {2007},
  author = {Udrea, Octavian and Lise Getoor}
}

@conference {namata:cikm07,
  title = {A Dual-View Approach to Interactive Network Visualization},
  booktitle = {ACM Conference on Information and Knowledge Management},
  year = {2007},
  author = {Namata, Galileo Mark and Staats, Brian and Lise Getoor and Shneiderman, Ben}
}

@article {islamaj:bmc07,
  title = {Features generated for computational splice-site prediction correspond to functional elements},
  journal = {BMC Bioinformatics},
  volume = {8},
  number = {410},
  year = {2007},
  note = {Electronic version is available at http://www.biomedcentral.com/1471-2105/8/410},
  month = {October},
  keywords = {feature generation, functional biological signals, splice-site},
  author = {Islamaj, Rezarta and Lise Getoor and Wilbur, W. John and Mount, Stephen}
}

@conference {kang:iv07,
  title = {GeoDDupe: A Novel Interface for Interactive Entity Resolution in Geospatial Data},
  booktitle = {International Conference on Information Visualization},
  year = {2007},
  publisher = {IEEE Computer Society},
  organization = {IEEE Computer Society},
  author = {Kang, Hyunmo and Sehgal, Vivek and Lise Getoor}
}

@book {koller:gm-ch-srl-book07,
  title = {Graphical Models in a Nutshell},
  series = {An Introduction to Statistical Relational Learning},
  volume = {1},
  year = {2007},
  pages = {13--55},
  publisher = {MIT Press},
  organization = {MIT Press},
  edition = {1},
  chapter = {2},
  abstract = {

Probabilistic graphical models are an elegant framework which combines uncertainty (probabilities) and logical structure (independence constraints) to compactly represent complex, real-world phenomena. The framework is quite general in that many of the commonly proposed statistical models (Kalman filters, hidden Markov models, Ising models) can be described as graphical models. Graphical models have enjoyed a surge of interest in the last two decades, due both to the flexibility and power of the representation and to the increased ability to effectively learn and perform inference in large networks.

}, author = {Daphne Koller and Nir Friedman and Lise Getoor and Benjamin Taskar}, editor = {Lise Getoor and Benjamin Taskar} } @unpublished {udrea:iswc07, title = {HOMER: Ontology Alignment Visualization and Analysis}, year = {2007}, author = {Udrea, Octavian and Lise Getoor and Miller, Renee} } @conference {udrea:iswc07-demo, title = {HOMER: Ontology visualization and analysis}, booktitle = {Demo Presentation at International Semantic Web Conference (ISWC)}, year = {2007}, author = {Udrea, Octavian and Miller, Renee and Lise Getoor} } @article {singh:de07, title = {Increasing the predictive power of affiliation networks}, journal = {IEEE Data Engineering Bulletin}, volume = {30}, number = {2}, year = {2007}, month = {jul}, author = {Singh, Lisa and Lise Getoor} } @book {getoor:srlbook07, title = {Introduction to Statistical Relational Learning}, year = {2007}, publisher = {The MIT Press}, organization = {The MIT Press}, author = {Lise Getoor and Benjamin Taskar} } @conference {udrea:sigmod07, title = {Leveraging Data and Structure in Ontology Integration}, booktitle = {Proceedings of ACM-SIGMOD 2007 International Conference on Management of Data}, year = {2007}, pages = {449{\textendash}460}, author = {Udrea, Octavian and Lise Getoor and Miller, Renee} } @unpublished {sen:um-tr07, title = {Link-based Classification}, number = {CS-TR-4858}, year = {2007}, month = {February}, publisher = {University of Maryland}, type = {Technical Report}, author = {Sen, Prithviraj and Lise Getoor} } @conference {bhattacharya:nectar07, title = {Online Collective Entity Resolution}, booktitle = {The 22nd National Conference on Artificial Intelligence (NECTAR Track)}, year = {2007}, publisher = {AAAI Press}, organization = {AAAI Press}, author = {Bhattacharya, Indrajit and Lise Getoor} } @conference {zheleva:kdd07-wkshp, title = {Preserving the Privacy of Sensitive Relationships in Graph Data}, booktitle = {First ACM SIGKDD Workshop on Privacy, Security, and Trust in KDD (PinKDD 2007)}, year 
= {2007}, note = {An extended version of this paper can be found at http://linqs.cs.umd.edu/basilic/web/Publications/2008/zheleva:kdd07-lncs/}, author = {Zheleva, Elena and Lise Getoor} } @article {hung:tocl, title = {Probabilistic Interval XML}, journal = {ACM Transactions on Computational Logic (TOCL)}, year = {2007}, author = {Hung, Edward and Lise Getoor and Subrahmanian, V. S.} } @book {getoor:prm-ch-srl-book07, title = {Probabilistic Relational Models}, series = {An Introduction to Statistical Relational Learning}, volume = {1}, year = {2007}, pages = {129--174}, publisher = {MIT Press}, organization = {MIT Press}, edition = {1}, chapter = {5}, abstract = {

Probabilistic relational models (PRMs) are a rich representation language for structured statistical models. They combine a frame-based logical representation with probabilistic semantics based on directed graphical models (Bayesian networks). This chapter gives an introduction to probabilistic relational models, describing semantics for attribute uncertainty, structural uncertainty, and class uncertainty. For each case, learning algorithms and some sample results are presented.

}, author = {Lise Getoor and Nir Friedman and Daphne Koller and Avi Pfeffer and Benjamin Taskar}, editor = {Lise Getoor and Benjamin Taskar} } @article {bhattacharya:jair07, title = {Query-time Entity Resolution}, journal = {Journal of Artificial Intelligence Research (JAIR)}, volume = {30}, year = {2007}, month = {Dec}, pages = {621{\textendash}657}, author = {Bhattacharya, Indrajit and Lise Getoor} } @conference {diehl:aaai07, title = {Relationship Identification for Social Network Discovery}, booktitle = {AAAI {\textquoteright}07: Proceedings of the 22nd National Conference on Artificial Intelligence}, year = {2007}, month = {July}, author = {Diehl, Christopher and Namata, Galileo Mark and Lise Getoor} } @conference {sen:dune07, title = {Representing Tuple and Attribute Uncertainty in Probabilistic Databases}, booktitle = {Workshop on Data Mining of Uncertain Data (ICDM)}, year = {2007}, author = {Sen, Prithviraj and Deshpande, Amol and Lise Getoor} } @article {islamaj:nar07, title = {SplicePort - An interactive splice-site analysis tool}, journal = {Nucleic Acids Research}, year = {2007}, author = {Islamaj, Rezarta and Lise Getoor and Wilbur, W. John and Mount, Stephen} } @conference {bilgic:aaai07, title = {VOILA: Efficient Feature-value Acquisition for Classification}, booktitle = {AAAI {\textquoteright}07: Proceedings of the 22nd National Conference on Artificial Intelligence}, year = {2007}, month = {July}, author = {Bilgic, Mustafa and Lise Getoor} } @article {kang:kdd07, title = {Visual Analysis of Dynamic Group Membership in Temporal Social Networks}, journal = {SIGKDD Explorations, Special Issue on Visual Analytics}, volume = {9}, number = {2}, year = {2007}, month = {dec}, pages = {13--21}, author = {Kang, Hyunmo and Lise Getoor and Singh, Lisa} } @conference {singh:iv07, title = {Visual mining of multi-modal social networks at different abstraction levels}, booktitle = {IEEE Conference on Information Visualization - Symposium of Visual Data Mining (IV-VDM)}, year = {2007}, author = {Singh, Lisa and Beard, Mitchell and Lise Getoor and Blake, M. Brian} }
@article {bhattacharya:de06, title = {Collective Entity Resolution in Relational Data}, journal = {Data Engineering Bulletin}, volume = {29}, number = {2}, year = {2006}, month = {june}, author = {Bhattacharya, Indrajit and Lise Getoor} } @conference {sen:sim_lacs06, title = {Cost-Sensitive Learning with Conditional Markov Networks}, booktitle = {SIAM Data Mining Workshop on Link Analysis, Counterterrorism and Security}, year = {2006}, author = {Sen, Prithviraj and Lise Getoor} } @conference {sen:icml06, title = {Cost-Sensitive Learning with Conditional Markov Networks}, booktitle = {International Conference on Machine Learning}, year = {2006}, author = {Sen, Prithviraj and Lise Getoor} } @conference {bilgic:vast06, title = {D-Dupe: An Interactive Tool for Entity Resolution in Social Networks}, booktitle = {Visual Analytics Science and Technology (VAST)}, year = {2006}, month = {October}, address = {Baltimore}, author = {Bilgic, Mustafa and Licamele, Louis and Lise Getoor and Shneiderman, Ben} } @conference {sen:srl06, title = {Empirical Comparison of Approximate Inference Algorithms for Networked Data}, booktitle = {ICML Workshop on Statistical Relational Learning (SRL)}, year = {2006}, author = {Sen, Prithviraj and Lise Getoor} } @conference {entity-res-geodata, title = {Entity Resolution in Geospatial Data Integration}, booktitle = {ACM GIS}, year = {2006}, author = {Sehgal, Vivek and Lise Getoor and Viechnicki, Peter} } @conference {bhattacharya:sunbelt06, title = {Entity Resolution in Social Networks}, booktitle = {International Sunbelt Social Network Conference (Sunbelt XXVI)}, year = {2006}, author = {Bhattacharya, Indrajit and Lise Getoor} } @book {bhattacharya:mgd-book06, title = {Entity 
Resolution in Graphs}, series = {Mining Graph Data}, volume = {1}, year = {2006}, pages = {311--344}, publisher = {Wiley}, organization = {Wiley}, edition = {1}, chapter = {13}, abstract = {

In many applications, there are a variety of ways of referring to the same underlying real-world entity. For example, J. Doe, Jonathan Doe, and Jon Doe may all refer to the same person. In addition, entity references may be linked or grouped together. For example, Jonathan Doe may be married to Jeanette Doe and may have dependents James Doe, Jason Doe, and Jacqueline Doe, and Jon Doe may be married to Jean Doe and J. Doe may have dependents Jim Doe, Jason Doe, and Jackie Doe. Given such data, we can build a graph from the entity references, where the nodes are the entity references and edges (or often hyperedges) in the graph indicate links among the references.

However, the problem is that for any real-world entity there may well be more than one node in the graph that refers to that entity. In the example above, we may have three nodes all referring to the individual Jonathan Doe, two nodes referring to Jeanette Doe, two nodes referring to each of James Doe, Jason Doe, and Jacqueline Doe. Further, because the edges are defined over entity references, rather than entities themselves, the graph does not accurately reflect the relationships between entities. For example, until we realize that Jon Doe refers to the same person as Jonathan Doe, we may not think that Jon Doe has any children, and until we realize that J. Doe refers to the same person as Jonathan Doe, we will not realize that he is married.

}, author = {Indrajit Bhattacharya and Lise Getoor}, editor = {Diane Cook and Lawrence Holder} } @conference {zhao:sna06, title = {Event Classification and Relationship Labeling in Affiliation Networks}, booktitle = {ICML Workshop on Statistical Network Analysis (SNA)}, year = {2006}, author = {Zhao, Bin and Sen, Prithviraj and Lise Getoor} } @conference {islamaj:pkdd06, title = {Feature Generation Algorithm: an Application to Splice Site Prediction}, booktitle = {Knowledge Discovery in Databases: PKDD 2006}, series = {Lecture Notes in Computer Science}, volume = {4213}, year = {2006}, month = {September}, pages = {553--560}, publisher = {Springer}, organization = {Springer}, address = {Berlin, Germany}, author = {Islamaj, Rezarta and Lise Getoor and Wilbur, W. John} } @conference {islamaj:fsdm06, title = {A Feature Generation Algorithm for Sequences with Application to Splice Site Prediction}, booktitle = {International Workshop on Feature Selection for Data Mining (FSDM)}, year = {2006}, month = {April}, address = {Bethesda, Maryland}, author = {Islamaj, Rezarta and Lise Getoor and Wilbur, W. 
John} } @conference {namata:icmlws-sna06, title = {Inferring Organizational Titles in Online Communications}, booktitle = {ICML Workshop on Statistical Network Analysis}, year = {2006}, author = {Namata, Galileo Mark and Lise Getoor and Diehl, Christopher} } @article {getoor:de06, title = {An Introduction to Probabilistic Graphical Models for Relational Data}, journal = {Data Engineering Bulletin}, volume = {29}, number = {1}, year = {2006}, month = {march}, author = {Lise Getoor} } @conference {bhattacharya:sdm06, title = {A Latent Dirichlet Model for Unsupervised Entity Resolution}, booktitle = {SIAM Conference on Data Mining (SDM)}, year = {2006}, note = {Winner of the Best Paper Award.}, month = {April}, author = {Bhattacharya, Indrajit and Lise Getoor} } @conference {diehl:sdm06, title = {Name Reference Resolution in Organizational Email Archives}, booktitle = {SIAM Conference on Data Mining (SDM)}, year = {2006}, author = {Diehl, Christopher and Lise Getoor and Namata, Galileo Mark} } @article {getoor:mlj06, title = {PRL: A Logical Approach to Probabilistic Relational Models}, journal = {Machine Learning Journal}, volume = {62}, number = {1-2}, year = {2006}, month = {feb}, author = {Lise Getoor and Grant, John} } @conference {licamale:icmlws-sna06, title = {Predicting Protein-Protein Interactions Using Relational Features}, booktitle = {ICML Workshop on Statistical Network Analysis}, year = {2006}, author = {Licamele, Louis and Lise Getoor} } @conference {bhattacharya:kdd06, title = {Query-Time Entity Resolution}, booktitle = {ACM SIGKDD International Conference on Knowledge Discovery and Data Mining}, year = {2006}, month = {August}, author = {Bhattacharya, Indrajit and Licamele, Louis and Lise Getoor} } @conference {bhattacharya:icml06-wkshp, title = {Relational Clustering for Entity Resolution Queries}, booktitle = {ICML Workshop on Statistical Relational Learning (SRL)}, year = {2006}, author = {Bhattacharya, Indrajit and Licamele, Louis and Lise Getoor} 
} @conference {licamele:icdm06, title = {Social Capital in Friendship-Event Networks}, booktitle = {IEEE International Conference on Data Mining (ICDM)}, year = {2006}, month = {December}, author = {Licamele, Louis and Lise Getoor} } @conference {kddpanel06, title = {Is there a grand challenge or X-prize for data mining?}, booktitle = {12th International Conference on Knowledge Discovery and Data Mining}, year = {2006}, author = {Piatetsky-Shapiro, Gregory and Grossman, Robert and Djeraba, Chabane and Feldman, Ronen and Lise Getoor and Zaki, Mohammed} } @conference {desjardins:ecml06, title = {Bayesian Network Learning with Abstraction Hierarchies and Context-Specific Independence}, booktitle = {16th European Conference on Machine Learning (ECML)}, year = {2005}, author = {desJardins, Marie and Rathod, Priyang and Lise Getoor} } @conference {licamele:linkkdd05, title = {Capital and Benefit in Social Networks}, booktitle = {ACM SIGKDD Workshop on Link Analysis and Group Detection (LinkKDD)}, year = {2005}, author = {Licamele, Louis and Bilgic, Mustafa and Lise Getoor and Roussopoulos, Nick} } @conference {bilgic:gd05, title = {D-Dupe: An Interactive Tool for Entity Resolution in Social Networks}, booktitle = {International Symposium on Graph Drawing}, series = {Lecture Notes in Computer Science}, volume = {3843}, year = {2005}, month = {September}, pages = {505{\textendash}507}, publisher = {Springer}, organization = {Springer}, author = {Bilgic, Mustafa and Licamele, Louis and Lise Getoor and Shneiderman, Ben}, editor = {Patrick Healy and Nikola S. 
Nikolov} } @article {getoor:kdd-exp05, title = {Link Mining: A Survey}, journal = {SigKDD Explorations Special Issue on Link Mining}, volume = {7}, number = {2}, year = {2005}, month = {december}, author = {Lise Getoor and Diehl, Christopher} } @book {getoor:lbc-book-ch05, title = {Link-based Classification}, series = {Advanced Methods for Knowledge Discovery from Complex Data}, volume = {1}, year = {2005}, pages = {189--207}, publisher = {Springer-Verlag}, organization = {Springer-Verlag}, edition = {1}, chapter = {7}, abstract = {

A key challenge for machine learning is the problem of mining richly structured data sets, where the objects are linked in some way due to either an explicit or implicit relationship that exists between the objects. Links among the objects demonstrate certain patterns, which can be helpful for many machine learning tasks and are usually hard to capture with traditional statistical models. Recently there has been a surge of interest in this area, fuelled largely by interest in web and hypertext mining, but also by interest in mining social networks, bibliographic citation data, epidemiological data and other domains best described using a linked or graph structure. In this chapter we propose a framework for modeling link distributions, a link-based model that supports discriminative models describing both the link distributions and the attributes of linked objects. We use a structured logistic regression model, capturing both content and links. We systematically evaluate several variants of our link-based model on a range of data sets including both web and citation collections. In all cases, the use of the link distribution improves classification performance.

}, author = {Lise Getoor}, editor = {Ujjwal Maulik and Lawrence Holder and Diane Cook} } @conference {singh:icdm05, title = {Pruning Social Networks Using Structural Properties and Descriptive Attributes}, booktitle = {IEEE International Conference on Data Mining (ICDM)}, year = {2005}, pages = {773--776}, author = {Singh, Lisa and Lise Getoor and Licamele, Louis} } @conference {bhattacharya:kdd05-wkshp, title = {Relational Clustering for Multi-type Entity Resolution}, booktitle = {ACM SIGKDD Workshop on Multi Relational Data Mining (MRDM)}, year = {2005}, author = {Bhattacharya, Indrajit and Lise Getoor} } @conference {bhattacharya:kdd04-wkshp, title = {Deduplication and Group Detection using Links}, booktitle = {ACM SIGKDD Workshop on Link Analysis and Group Detection (LinkKDD)}, year = {2004}, author = {Bhattacharya, Indrajit and Lise Getoor} } @conference {bhattacharya:sigmod04-wkshp, title = {Iterative Record Linkage for Cleaning and Integration}, booktitle = {ACM SIGMOD Workshop on Research Issues in Data Mining and Knowledge Discovery (DMKD)}, year = {2004}, author = {Bhattacharya, Indrajit and Lise Getoor} } @article {getoor:aimj04, title = {Understanding Tuberculosis Epidemiology Using Probabilistic Relational Models}, journal = {AI in Medicine Journal}, volume = {30}, year = {2004}, pages = {233--256}, author = {Lise Getoor and Rhee, Jeanne and Koller, Daphne and Small, Peter} } @conference {bhattacharya:acl04, title = {Unsupervised Sense Disambiguation using Bilingual Probabilistic Models}, booktitle = {Annual Meeting of the Association for Computational Linguistics (ACL)}, year = {2004}, month = {July}, author = {Bhattacharya, Indrajit and Lise Getoor and Bengio, Yoshua} } @conference {lerman:sigmod04, title = {Using the Structure of Web Sites for Automatic Segmentation of Tables}, booktitle = {Proceedings of ACM-SIGMOD 2004 International Conference on Management of Data}, year = {2004}, author = {Lerman, Kristina and Lise Getoor and Minton, Steve and 
Knoblock, Craig} } @article {getoor:kdd-exp03, title = {Link Mining: A New Data Mining Challenge}, journal = {SIGKDD Explorations}, volume = {5}, number = {1}, year = {2003}, pages = {85--89}, author = {Lise Getoor} } @conference {lu:icml03, title = {Link-based Classification}, booktitle = {Proceedings of the International Conference on Machine Learning (ICML)}, year = {2003}, author = {Lu, Qing and Lise Getoor} } @conference {lu:icmlws03, title = {Link-based Classification Using Labeled and Unlabeled Data}, booktitle = {ICML Workshop on "The Continuum from Labeled to Unlabeled Data in Machine Learning and Data Mining"}, year = {2003}, author = {Lu, Qing and Lise Getoor} } @conference {lu:ijcaiws03, title = {Link-based Text Classification}, booktitle = {IJCAI Workshop on "Text Mining and Link Analysis"}, year = {2003}, author = {Lu, Qing and Lise Getoor} } @conference {hung:icde03, title = {PXML: A Probabilistic Semistructured Data Model and Algebra}, booktitle = {Proceedings of the IEEE International Conference on Data Engineering}, year = {2003}, author = {Hung, Edward and Lise Getoor and Subrahmanian, V. S.} } @conference {hung:icdt03, title = {Probabilistic Interval XML}, booktitle = {Proceedings of the International Conference on Database Theory}, year = {2003}, author = {Hung, Edward and Lise Getoor and Subrahmanian, V. 
S.} } @article {getoor:de03, title = {Structure Discovery Using Statistical Relational Learning}, journal = {Data Engineering Bulletin}, volume = {26}, number = {3}, year = {2003}, pages = {11--18}, author = {Lise Getoor} } @article {getoor:jmlr02, title = {Learning Probabilistic Models of Link Structure}, journal = {Journal of Machine Learning Research}, volume = {3}, year = {2002}, pages = {679--707}, author = {Lise Getoor and Friedman, Nir and Koller, Daphne and Benjamin Taskar} } @article {getoor:etai02, title = {Learning Structured Statistical Models from Relational Data}, journal = {Electronic Transactions on Artificial Intelligence}, volume = {6}, number = {section B}, year = {2002}, author = {Lise Getoor and Friedman, Nir and Koller, Daphne} } @conference {getoor:icml01, title = {Learning Probabilistic Models of Relational Structure}, booktitle = {Proceedings of International Conference on Machine Learning (ICML)}, year = {2001}, author = {Lise Getoor and Friedman, Nir and Koller, Daphne and Benjamin Taskar} } @book {getoor:rdm-book01, title = {Learning Probabilistic Relational Models}, series = {Relational Data Mining}, volume = {1}, year = {2001}, pages = {307--335}, publisher = {Springer-Verlag}, organization = {Springer-Verlag}, edition = {1}, chapter = {13}, abstract = {

Probabilistic relational models (PRMs) are a language for describing statistical models over typed relational domains. A PRM models the uncertainty over the attributes of objects in the domain and uncertainty over the relations between the objects. The model specifies, for each attribute of an object, its (probabilistic) dependence on other attributes of that object and on attributes of related objects. The dependence model is defined at the level of classes of objects. The class dependence model is instantiated for any object in the class, as appropriate to the particular context of the object (i.e., the relations between this objects and others). PRMs can also represent uncertainty over the relational structure itself, e.g., by specifying a (class-level) probability that two objects will be related to each other. PRMs provide a foundation for dealing with the noise and uncertainty encountered in most real-world domains. In this chapter, we show that the compact and natural representation of PRMs allows them to be learned directly from an existing relational database using well-founded statistical techniques. We give an introduction to PRMs and an overview of methods for learning them. We show that PRMs provide a new framework for relational data mining, and offer new challenges for the endeavor of learning relational models for real-world domains.

}, author = {Lise Getoor and Nir Friedman and Daphne Koller and Avi Pfeffer}, editor = {Saso Dzeroski and Nada Lavrac} } @conference {325, title = {Learning Probabilistic Relational Models}, booktitle = {Relational Data Mining}, year = {2001}, publisher = {Springer-Verlag}, organization = {Springer-Verlag}, author = {Lise Getoor and Friedman, Nir and Koller, Daphne and Pfeffer, Avi} } @phdthesis {getoor:thesis01, title = {Learning Statistical Models from Relational Data}, year = {2001}, school = {Stanford University}, type = {{PhD} thesis}, author = {Lise Getoor} } @conference {getoor:mrdm01, title = {Multi-relational Data Mining Using Probabilistic Models}, booktitle = {Multi-Relational Data Mining Workshop}, year = {2001}, author = {Lise Getoor} } @conference {getoor:ijcaiws01, title = {Probabilistic Models of Text and Link Structure for Hypertext Classification}, booktitle = {IJCAI Workshop on Text Learning: Beyond Supervision}, year = {2001}, author = {Lise Getoor and Segal, Eran and Benjamin Taskar and Koller, Daphne} } @conference {getoor:sigmod01, title = {Selectivity estimation using probabilistic relational models}, booktitle = {Proceedings of ACM-SIGMOD 2001 International Conference on Management of Data}, year = {2001}, author = {Lise Getoor and Koller, Daphne and Benjamin Taskar} } @conference {getoor:icmlws00, title = {From Instances to Classes in Probabilistic Relational Models}, booktitle = {Proceedings of the ICML Workshop on Attribute-Value and Relational Learning: Crossing the Boundaries}, year = {2000}, author = {Lise Getoor and Koller, Daphne and Friedman, Nir} } @conference {324, title = {From Instances to Classes in Probabilistic Relational Models}, booktitle = {Proceedings of the ICML Workshop on Attribute-Value and Relational Learning: Crossing the Boundaries}, year = {2000}, author = {Lise Getoor and Koller, Daphne and Friedman, Nir} } @conference {getoor:srl00, title = {Learning Probabilistic Relational Models with Structural Uncertainty}, 
booktitle = {Proceedings of the AAAI Workshop on Learning Statistical Models from Relational Data}, year = {2000}, author = {Lise Getoor and Koller, Daphne and Benjamin Taskar and Friedman, Nir} } @conference {desjardins:sara00, title = {Using Feature Hierarchies in Bayesian Network Learning}, booktitle = {Symposium on Abstraction, Reformulation and Approximation}, year = {2000}, author = {desJardins, Marie and Lise Getoor and Koller, Daphne} } @conference {friedman:aistats99, title = {Efficient Learning Using Constrained Sufficient Statistics}, booktitle = {Uncertainty99}, year = {1999}, author = {Friedman, Nir and Lise Getoor} } @conference {friedman:ijcai99, title = {Learning Probabilistic Relational Models}, booktitle = {International Joint Conference on Artificial Intelligence}, year = {1999}, author = {Friedman, Nir and Lise Getoor and Koller, Daphne and Pfeffer, Avi} } @conference {getoor:webkdd99, title = {Using Probabilistic Relational Models for Collaborative Filtering}, booktitle = {Working Notes of the KDD Workshop on Web Usage Analysis and User Profiling}, year = {1999}, author = {Lise Getoor and Mehran Sahami} } @conference {chajewska:mdm98, title = {Using Classification Techniques for Utility Elicitation: A Comparison between Standard Gamble and Visual Analog Scale Methods}, booktitle = {Twentieth Anniversary Meeting of the Society for Medical Decision Making}, year = {1998}, author = {Chajewska, Urszula and Norman, Joseph and Lise Getoor} } @conference {chajewska:uai98, title = {Utility Elicitation as a Classification Problem}, booktitle = {Uncertainty in Artificial Intelligence}, year = {1998}, author = {Chajewska, Urszula and Lise Getoor and Norman, Joseph and Shahar, Yuval} } @conference {chajewska:aaaiss98, title = {Utility Elicitation as a Classification Problem}, booktitle = {Proceedings of the AAAI Spring Symposium Series on Interactive and Mixed Initiative Decision-Theoretic Systems}, year = {1998}, author = {Chajewska, Urszula and Lise Getoor 
and Norman, Joseph} } @conference {getoor:aaai97, title = {Effective Redundant Constraints for Online Scheduling}, booktitle = {Proceedings of the Fourteenth National Conference on Artificial Intelligence}, year = {1997}, author = {Lise Getoor and Ottosson, Gregor and Fromherz, Markus and Carlson, Bjorn} } @conference {getoor:aaaiws97, title = {Online Scheduling for Reprographic Machines}, booktitle = {Working notes AAAI Workshop on Online Search}, year = {1997}, author = {Lise Getoor and Fromherz, Markus} } @conference {lansky:aaiss95, title = {The Collage/Khoros Link: Planning for Image Processing Tasks}, booktitle = {Proceedings of the AAAI Spring Symposium on Integrated Planning Applications}, year = {1995}, author = {Lansky, Amy and Friedman, Mark and Lise Getoor and Schmidler, Scott and Short, Jr., Nick} } @conference {lansky:ijcai95, title = {Scope and Abstraction: Two Criteria for Localized Planning}, booktitle = {Proceedings of the International Joint Conference on Artificial Intelligence}, year = {1995}, author = {Lansky, Amy and Lise Getoor} } @conference {lansky:aaaifs94, title = {Practical Planning in COLLAGE}, booktitle = {Proceedings of the AAAI Fall Symposium on Planning and Learning: On to Real Applications}, year = {1994}, author = {Lansky, Amy and Lise Getoor} } @conference {lansky:tra94, title = {Scope and Abstraction: Two Criteria for Localized Planning}, booktitle = {Proceedings of the Workshop on Theory Reformulation and Abstraction}, year = {1994}, author = {Lansky, Amy and Lise Getoor} }