% Bibliography entries, one field per line. Entry order is preserved from the
% original export (descending by year). Citation keys are unchanged.
%
% Conventions used below:
%   - names are written "Last, First" so that suffixes and multi-part
%     surnames parse unambiguously;
%   - month uses the unquoted three-letter macros;
%   - page ranges use a double hyphen;
%   - case-sensitive words in titles are brace-protected.

% NOTE(review): the spelling of the third author's surname looks unusual --
% please confirm it against the published paper.
@inproceedings{352,
  author    = {Srinivasan, Sriram and Rao, Nikhil S. and Subbaian, Karthik and Getoor, Lise},
  title     = {Identifying Facet Mismatches In Search Via Micrographs},
  booktitle = {International Conference on Information and Knowledge Management (CIKM)},
  year      = {2019},
  keywords  = {collective classification, defect, probabilistic soft logic, search, statistical relational learning, structured prediction},
  abstract  = {E-commerce search engines are the primary means by which customers shop for products online. Each customer query contains multiple facets such as product type, color, brand, etc. A successful search engine retrieves products that are relevant to the query along each of these attributes. However, due to lexical (erroneous title, description, etc.) and behavioral irregularities (clicks or purchases of products that do not belong to the same facet as the query), some mismatched products are shown in the search results. These irregularities are often detected using simple binary classifiers like gradient boosted decision trees or logistic regression. Typically, these binary classifiers use strong independence assumptions between the samples and ignore structural relationships available in the data, such as the connections between products and queries. In this paper, we use the connections that exist between products and query to identify a special kind of structure we refer to as a micrograph. Further, we make use of Statistical Relational Learning (SRL) to incorporate these micrographs in the data and pose the problem as a structured prediction problem. We refer to this approach as structured mismatch classification (smc). In addition, we show that naive addition of structure does not improve the performance of the model and hence introduce a variation of smc, strong smc (s2mc), which improves over the baseline by passing information from high-confidence predictions to lower confidence predictions. In our empirical evaluation we show that our proposed approach outperforms the baseline classification methods by up to 12\% in precision. Furthermore, we use quasi-Newton methods to make our method viable for real-time inference in a search engine and show that our approach is up to 150 times faster than existing ADMM-based solvers.},
}

% NOTE(review): pages 1--1 and chapter 1 look like early-access placeholders;
% confirm volume, issue and page range against the published version.
@article{343,
  author   = {Ramesh, Arti and Goldwasser, Dan and Huang, Bert and Daume III, Hal and Getoor, Lise},
  title    = {Interpretable Engagement Models for {MOOCs} using Hinge-loss {Markov} Random Fields},
  journal  = {IEEE Transactions on Learning Technologies (TLT)},
  volume   = {14},
  year     = {2019},
  pages    = {1--1},
  chapter  = {1},
  abstract = {Maintaining and cultivating student engagement is critical for learning. Understanding factors affecting student engagement can help in designing better courses and improving student retention. The large number of participants in massive open online courses (MOOCs) and data collected from their interactions on the MOOC open up avenues for studying student engagement at scale. In this work, we develop an interpretable statistical relational learning model for understanding student engagement in online courses using a complex combination of behavioral, linguistic, structural, and temporal cues. We show how to abstract student engagement types of active, passive, and disengagement as meaningful latent variables using logical rules in our model connecting student behavioral signals with student success in MOOCs. We demonstrate that the latent formulation for engagement helps in predicting two measures of student success: performance, their final grade in the course, and survival, their continued presence in the course till the end, across seven MOOCs. Further, in order to initiate better instructor interventions, we need to be able to predict student success early in the course. We demonstrate that we can predict student success early in the course reliably using the latent model. We also demonstrate the utility of our models in predicting student success in new courses, by training our models on one course and testing on another course. We show that the latent abstractions are helpful in predicting student success and engagement reliably in new MOOCs that haven{\textquoteright}t yet gathered student interaction data. We then perform a closer quantitative analysis of different features derived from student interactions on the MOOC and identify student activities that are good indicators of student success at different points in the course. Through a qualitative analysis of the latent engagement variable values, we demonstrate their utility in understanding students{\textquoteright} engagement levels at various points in the course and movement of students across different types of engagement.},
}

% PhD thesis: the entry type carries the thesis kind, so no type override is needed.
@phdthesis{namata:phdthesis12,
  author = {Namata, Galileo Mark},
  title  = {Identifying Graphs from Noisy Observational Data},
  school = {University of Maryland - College Park},
  year   = {2012},
  month  = may,
}

@inproceedings{london:nips12asalsn,
  author    = {London, Ben and Huang, Bert and Getoor, Lise},
  title     = {Improved Generalization Bounds for Large-scale Structured Prediction},
  booktitle = {NIPS Workshop on Algorithmic and Statistical Approaches for Large Social Networks},
  year      = {2012},
}

@inproceedings{minton:cmla11,
  author    = {Minton, Steve and Michelson, Matthew and See, Kane and Macskassy, Sofus and Gazen, Bora C. and Getoor, Lise},
  title     = {Improving Classifier Performance by Autonomously Collecting Background Knowledge from the {Web}},
  booktitle = {Tenth International Conference on Machine Learning and Applications},
  year      = {2011},
}

% NOTE(review): the keywords value has no separators; presumably several
% keywords were run together -- confirm the intended split before editing.
@article{licamele:bmcbio10,
  author   = {Licamele, Louis and Getoor, Lise},
  title    = {Indirect two-sided relative ranking: a robust similarity measure for gene expression data},
  journal  = {BMC Bioinformatics},
  year     = {2010},
  keywords = {gene expression bioinformatics drug therapeutics},
}

% PhD thesis: the entry type carries the thesis kind, so no type override is needed.
@phdthesis{bilgic:phdthesis10,
  author = {Bilgic, Mustafa},
  title  = {Information Acquisition in Structured Domains},
  school = {University of Maryland - College Park},
  year   = {2010},
  month  = aug,
}

@inproceedings{namata:kddu09,
  author    = {Namata, Galileo Mark and Getoor, Lise},
  title     = {Identifying Graphs From Noisy and Incomplete Data},
  booktitle = {1st ACM SIGKDD Workshop on Knowledge Discovery from Uncertain Data},
  year      = {2009},
}

@inproceedings{schnaitter:vldb09,
  author    = {Schnaitter, Karl and Polyzotis, Neoklis and Getoor, Lise},
  title     = {Index Interactions in Physical Design Tuning: Modeling, Analysis, and Applications},
  booktitle = {International Conference on Very Large Data Bases},
  year      = {2009},
}

@article{kang:tvcg08,
  author  = {Kang, Hyunmo and Getoor, Lise and Shneiderman, Ben and Bilgic, Mustafa and Licamele, Louis},
  title   = {Interactive Entity Resolution in Relational Data: A Visual Analytic Tool and Its Evaluation},
  journal = {IEEE Transactions on Visualization and Computer Graphics},
  volume  = {14},
  number  = {5},
  year    = {2008},
  pages   = {999--1014},
}

@article{singh:de07,
  author  = {Singh, Lisa and Getoor, Lise},
  title   = {Increasing the predictive power of affiliation networks},
  journal = {IEEE Data Engineering Bulletin},
  volume  = {30},
  number  = {2},
  year    = {2007},
  month   = jul,
}

@book{getoor:srlbook07,
  author       = {Getoor, Lise and Taskar, Benjamin},
  title        = {Introduction to Statistical Relational Learning},
  publisher    = {The MIT Press},
  organization = {The MIT Press},
  year         = {2007},
}

@inproceedings{namata:icmlws-sna06,
  author    = {Namata, Galileo Mark and Getoor, Lise and Diehl, Christopher},
  title     = {Inferring Organizational Titles in Online Communications},
  booktitle = {ICML Workshop on Statistical Network Analysis},
  year      = {2006},
}

@article{getoor:de06,
  author  = {Getoor, Lise},
  title   = {An Introduction to Probabilistic Graphical Models for Relational Data},
  journal = {IEEE Data Engineering Bulletin},
  volume  = {29},
  number  = {1},
  year    = {2006},
  month   = mar,
}

@inproceedings{bhattacharya:sigmod04-wkshp,
  author    = {Bhattacharya, Indrajit and Getoor, Lise},
  title     = {Iterative Record Linkage for Cleaning and Integration},
  booktitle = {ACM SIGMOD Workshop on Research Issues in Data Mining and Knowledge Discovery (DMKD)},
  year      = {2004},
}