Note
This script is based heavily on Jonathan Taylor’s class notes http://www.stanford.edu/class/stats191/interactions.html
In [1]: from urllib2 import urlopen
In [1]: import numpy as np
In [1]: import pandas
In [1]: import matplotlib.pyplot as plt
In [1]: from statsmodels.formula.api import ols
In [1]: from statsmodels.graphics.api import interaction_plot, abline_plot
In [1]: from statsmodels.stats.anova import anova_lm
load data
In [1]: try:
...: salary_table = pandas.read_csv('./salary.table')
...: except:
...: print "fetching from website"
...: url = 'http://stats191.stanford.edu/data/salary.table'
...: url = urlopen(url)
...: salary_table = pandas.read_table(url)
...: salary_table.to_csv('salary.table', index=False)
...:
In [1]: E = salary_table.E
In [1]: M = salary_table.M
In [1]: X = salary_table.X
In [1]: S = salary_table.S
Take a look at the data
In [1]: plt.figure(figsize=(6, 6));
In [1]: symbols = ['D', '^']
In [1]: colors = ['r', 'g', 'blue']
In [1]: factor_groups = salary_table.groupby(['E', 'M'])
In [1]: for values, group in factor_groups:
...: i, j = values
...: plt.scatter(group['X'], group['S'], marker=symbols[j], color=colors[i - 1],
...: s=144)
...:
In [1]: plt.xlabel('Experience');
In [1]: plt.ylabel('Salary');
Fit a linear model
In [1]: formula = 'S ~ C(E) + C(M) + X'
In [1]: lm = ols(formula, salary_table).fit()
In [1]: print lm.summary()
Have a look at the created design matrix
In [1]: lm.model.exog[:20]
Or since we initially passed in a DataFrame, we have a DataFrame available in
In [1]: lm.model.data.orig_exog
We keep a reference to the original untouched data in
In [1]: lm.model.data.frame
Get influence statistics
In [1]: infl = lm.get_influence()
In [1]: print infl.summary_table()
or get a dataframe
In [1]: df_infl = infl.summary_frame()
Now plot the residuals within the groups separately
In [1]: resid = lm.resid
In [1]: plt.figure(figsize=(6, 6));
In [1]: for values, group in factor_groups:
...: i, j = values
...: group_num = i * 2 + j - 1 # for plotting purposes
...: x = [group_num] * len(group)
...: plt.scatter(x, resid[group.index], marker=symbols[j], color=colors[i - 1],
...: s=144, edgecolors='black')
...:
In [1]: plt.xlabel('Group');
In [1]: plt.ylabel('Residuals');
Now we will test some interactions using ANOVA or an F-test
In [1]: interX_lm = ols("S ~ C(E) * X + C(M)", salary_table).fit()
In [1]: print interX_lm.summary()
Do an ANOVA check
In [1]: table1 = anova_lm(lm, interX_lm)
In [1]: print table1
In [1]: interM_lm = ols("S ~ X + C(E)*C(M)", data=salary_table).fit()
In [1]: print interM_lm.summary()
In [1]: table2 = anova_lm(lm, interM_lm)
In [1]: print table2
The design matrix as a DataFrame
In [1]: interM_lm.model.data.orig_exog
The design matrix as an ndarray
In [1]: interM_lm.model.exog
In [1]: interM_lm.model.exog_names
In [1]: infl = interM_lm.get_influence()
In [1]: resid = infl.resid_studentized_internal
In [1]: plt.figure(figsize=(6, 6));
In [1]: for values, group in factor_groups:
...: i, j = values
...: idx = group.index
...: plt.scatter(X[idx], resid[idx], marker=symbols[j], color=colors[i - 1],
...: s=144, edgecolors='black')
...:
In [1]: plt.xlabel('X');
In [1]: plt.ylabel('standardized resids');
Looks like one observation is an outlier. TODO: do we have a Bonferroni outlier test?
In [1]: drop_idx = abs(resid).argmax()
In [1]: print drop_idx # zero-based index
In [1]: idx = salary_table.index.drop([drop_idx])
In [1]: lm32 = ols('S ~ C(E) + X + C(M)', data=salary_table, subset=idx).fit()
In [1]: print lm32.summary()
In [1]: interX_lm32 = ols('S ~ C(E) * X + C(M)', data=salary_table, subset=idx).fit()
In [1]: print interX_lm32.summary()
In [1]: table3 = anova_lm(lm32, interX_lm32)
In [1]: print table3
In [1]: interM_lm32 = ols('S ~ X + C(E) * C(M)', data=salary_table, subset=idx).fit()
In [1]: table4 = anova_lm(lm32, interM_lm32)
In [1]: print table4
Replot the residuals
In [1]: try:
...: resid = interM_lm32.get_influence().summary_frame()['standard_resid']
...: except:
...: resid = interM_lm32.get_influence().summary_frame()['standard_resid']
...:
In [1]: plt.figure(figsize=(6, 6));
In [1]: for values, group in factor_groups:
...: i, j = values
...: idx = group.index
...: plt.scatter(X[idx], resid[idx], marker=symbols[j], color=colors[i - 1],
...: s=144, edgecolors='black')
...:
In [1]: plt.xlabel('X[~[32]]');
In [1]: plt.ylabel('standardized resids');
Plot the fitted values
In [1]: lm_final = ols('S ~ X + C(E)*C(M)',
...: data=salary_table.drop([drop_idx])).fit()
...:
In [1]: mf = lm_final.model.data.orig_exog
In [1]: lstyle = ['-', '--']
In [1]: plt.figure(figsize=(6, 6));
In [1]: for values, group in factor_groups:
...: i, j = values
...: idx = group.index
...: plt.scatter(X[idx], S[idx], marker=symbols[j], color=colors[i - 1],
...: s=144, edgecolors='black')
...: plt.plot(mf.X[idx].dropna(), lm_final.fittedvalues[idx].dropna(),
...: ls=lstyle[j], color=colors[i - 1])
...:
In [1]: plt.xlabel('Experience');
In [1]: plt.ylabel('Salary');
From our first look at the data, the difference between Master’s and PhD in the management group is different than in the non-management group. This is an interaction between the two qualitative variables management, M and education, E. We can visualize this by first removing the effect of experience, then plotting the means within each of the 6 groups using interaction_plot.
In [1]: U = S - X * interX_lm32.params['X']
In [1]: plt.figure(figsize=(6, 6));
In [1]: interaction_plot(E, M, U, colors=['red', 'blue'], markers=['^', 'D'],
...: markersize=10, ax=plt.gca())
...:
In [1]: try:
...: minority_table = pandas.read_table('minority.table')
...: except: # don't have data already
...: url = 'http://stats191.stanford.edu/data/minority.table'
...: url = urlopen(url)
...: minority_table = pandas.read_table(url)
...:
In [1]: factor_group = minority_table.groupby(['ETHN'])
In [1]: plt.figure(figsize=(6, 6));
In [1]: colors = ['purple', 'green']
In [1]: markers = ['o', 'v']
In [1]: for factor, group in factor_group:
...: plt.scatter(group['TEST'], group['JPERF'], color=colors[factor],
...: marker=markers[factor], s=12**2)
...:
In [1]: plt.xlabel('TEST');
In [1]: plt.ylabel('JPERF');
In [1]: min_lm = ols('JPERF ~ TEST', data=minority_table).fit()
In [1]: print min_lm.summary()
In [1]: plt.figure(figsize=(6, 6));
In [1]: for factor, group in factor_group:
...: plt.scatter(group['TEST'], group['JPERF'], color=colors[factor],
...: marker=markers[factor], s=12**2)
...:
In [1]: plt.xlabel('TEST')
In [1]: plt.ylabel('JPERF')
In [1]: abline_plot(model_results=min_lm, ax=plt.gca())
In [1]: min_lm2 = ols('JPERF ~ TEST + TEST:ETHN',
...: data=minority_table).fit()
...:
In [1]: print min_lm2.summary()
In [1]: plt.figure(figsize=(6, 6));
In [1]: for factor, group in factor_group:
...: plt.scatter(group['TEST'], group['JPERF'], color=colors[factor],
...: marker=markers[factor], s=12**2)
...:
In [1]: abline_plot(intercept=min_lm2.params['Intercept'],
...: slope=min_lm2.params['TEST'], ax=plt.gca(), color='purple')
...:
In [1]: abline_plot(intercept=min_lm2.params['Intercept'],
...: slope=min_lm2.params['TEST'] + min_lm2.params['TEST:ETHN'],
...: ax=plt.gca(), color='green')
...:
In [1]: min_lm3 = ols('JPERF ~ TEST + ETHN', data=minority_table).fit()
In [1]: print min_lm3.summary()
In [1]: plt.figure(figsize=(6, 6));
In [1]: for factor, group in factor_group:
...: plt.scatter(group['TEST'], group['JPERF'], color=colors[factor],
...: marker=markers[factor], s=12**2)
...:
In [1]: abline_plot(intercept=min_lm3.params['Intercept'],
...: slope=min_lm3.params['TEST'], ax=plt.gca(), color='purple')
...:
In [1]: abline_plot(intercept=min_lm3.params['Intercept'] + min_lm3.params['ETHN'],
...: slope=min_lm3.params['TEST'], ax=plt.gca(), color='green')
...:
In [1]: min_lm4 = ols('JPERF ~ TEST * ETHN', data=minority_table).fit()
In [1]: print min_lm4.summary()
In [1]: plt.figure(figsize=(6, 6));
In [1]: for factor, group in factor_group:
...: plt.scatter(group['TEST'], group['JPERF'], color=colors[factor],
...: marker=markers[factor], s=12**2)
...:
In [1]: abline_plot(intercept=min_lm4.params['Intercept'],
...: slope=min_lm4.params['TEST'], ax=plt.gca(), color='purple')
...:
In [1]: abline_plot(intercept=min_lm4.params['Intercept'] + min_lm4.params['ETHN'],
...: slope=min_lm4.params['TEST'] + min_lm4.params['TEST:ETHN'],
...: ax=plt.gca(), color='green')
...:
is there any effect of ETHN on slope or intercept
In [1]: table5 = anova_lm(min_lm, min_lm4)
In [1]: print table5
is there any effect of ETHN on intercept
In [1]: table6 = anova_lm(min_lm, min_lm3)
In [1]: print table6
is there any effect of ETHN on slope
In [1]: table7 = anova_lm(min_lm, min_lm2)
In [1]: print table7
is it just the slope or both?
In [1]: table8 = anova_lm(min_lm2, min_lm4)
In [1]: print table8
In [1]: try:
...: rehab_table = pandas.read_csv('rehab.table')
...: except:
...: url = 'http://stats191.stanford.edu/data/rehab.csv'
...: url = urlopen(url)
...: rehab_table = pandas.read_table(url, delimiter=",")
...: rehab_table.to_csv('rehab.table')
...:
In [1]: plt.figure(figsize=(6, 6));
In [1]: rehab_table.boxplot('Time', 'Fitness', ax=plt.gca())
In [1]: rehab_lm = ols('Time ~ C(Fitness)', data=rehab_table).fit()
In [1]: table9 = anova_lm(rehab_lm)
In [1]: print table9
In [1]: print rehab_lm.model.data.orig_exog
In [1]: print rehab_lm.summary()
In [1]: try:
...: kidney_table = pandas.read_table('./kidney.table')
...: except:
...: url = 'http://stats191.stanford.edu/data/kidney.table'
...: url = urlopen(url)
...: kidney_table = pandas.read_table(url, delimiter=" *")
...:
Explore the dataset
In [1]: print kidney_table.groupby(['Weight', 'Duration']).size()
balanced panel
In [1]: kt = kidney_table
In [1]: plt.figure(figsize=(6, 6));
In [1]: interaction_plot(kt['Weight'], kt['Duration'], np.log(kt['Days'] + 1),
...: colors=['red', 'blue'], markers=['D', '^'], ms=10,
...: ax=plt.gca())
...:
Things available in the calling namespace are also available in the formula evaluation namespace
In [1]: kidney_lm = ols('np.log(Days+1) ~ C(Duration) * C(Weight)', data=kt).fit()
In [1]: table10 = anova_lm(kidney_lm)
In [1]: print anova_lm(ols('np.log(Days+1) ~ C(Duration) + C(Weight)',
...: data=kt).fit(), kidney_lm)
...:
In [1]: print anova_lm(ols('np.log(Days+1) ~ C(Duration)', data=kt).fit(),
...: ols('np.log(Days+1) ~ C(Duration) + C(Weight, Sum)',
...: data=kt).fit())
...:
In [1]: print anova_lm(ols('np.log(Days+1) ~ C(Weight)', data=kt).fit(),
...: ols('np.log(Days+1) ~ C(Duration) + C(Weight, Sum)',
...: data=kt).fit())
...:
Illustrates the use of different types of sums of squares (I, II, III) and how the Sum contrast can be used to produce the same output between the 3.
Types I and II are equivalent under a balanced design.
Don’t use Type III with non-orthogonal contrasts — i.e., Treatment
In [1]: sum_lm = ols('np.log(Days+1) ~ C(Duration, Sum) * C(Weight, Sum)',
...: data=kt).fit()
...:
In [1]: print anova_lm(sum_lm)
In [1]: print anova_lm(sum_lm, typ=2)
In [1]: print anova_lm(sum_lm, typ=3)
In [1]: nosum_lm = ols('np.log(Days+1) ~ C(Duration, Treatment) * C(Weight, Treatment)',
...: data=kt).fit()
...:
In [1]: print anova_lm(nosum_lm)
In [1]: print anova_lm(nosum_lm, typ=2)
In [1]: print anova_lm(nosum_lm, typ=3)