%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
from numpy.random import random, normal
import pandas as pd
from scipy.io import wavfile as wv
from scipy import stats
blue, green, red, purple = sns.color_palette('deep', n_colors=4)
An important part of data analysis is data visualization. We've already been using matplotlib
to do this some, and this week we'll dig deeper into this package to understand a bit more about what it can do and how we can use it to make figures look exactly how we want them to look. The matplotlib documentation provides a huge amount of information, and this blog post discusses a useful way to go about using matplotlib (a fair amount of which we'll discuss here).
As we've seen before, there are two basic objects we will often deal with when generating a figure, the Figure
object and Axes
objects.
We'll start by making a moderately complicated signal to plot:
np.random.seed(seed=45232)
fs = 1000
t = np.arange(start=0, stop=1, step=1/fs)
y1 = .5*np.cos(2*np.pi*15*t + random()*2*np.pi)
y2 = 1.3*np.cos(2*np.pi*7*t + random()*2*np.pi)
y3 = 2*np.cos(2*np.pi*2*t + random()*2*np.pi)
s = y1 + y2 + y3 #+ n
We can create a figure and axis object with the subplots()
method in pyplot
(here plt
). Then, we can plot our signal (defined above) using the plot()
axis method, and we can adjust the limits on the axes by using the set_ylim()
and set_xlim()
axis methods.
fig, ax = plt.subplots()
ax.plot(t,s,'r')
ax.set_ylim(-10,10)
ax.set_xlim(-.5,1.5);
We can change the size of the figure when we create the figure object. We can also specify where the tick marks are on each axis and what the labels for the tick marks are, using the set_xticks()
, set_xticklabels()
, set_yticks()
, and set_yticklabels()
methods.
fig, ax = plt.subplots(1, 1, figsize=(8,4))
ax.plot(t,s,'r',lw=3)
ax.set_ylim(-4,4)
ax.set_xlim(0,1)
ax.set_xticks([0,t[np.argmax(s)],t[np.argmin(s)],1])
ax.set_xticklabels(['$t_0$','$t_{max}$','$t_{min}$','$t_n$'], fontsize=18)
ax.set_yticks(np.linspace(-4,4,5))
ax.set_yticklabels(['$-A$','$-A/2$','$0$','$+A/2$','$+A$'], fontsize=18);
We can easily write our own functions to make changes to figures if we don't like the default way that matplotlib does it. For example, here's a function for automatically adjusting axis limits based on the range of the data on each dimension and a specified 'padding' proportion on each dimension:
def adjust_limits(axis, xpad=.1, ypad=.1, adj_x=True, adj_y=True):
if adj_x:
xx = (axis.dataLim.x0,axis.dataLim.x1)
x_range = xx[1]-xx[0]
xlims = (xx[0]-xpad*x_range,xx[1]+xpad*x_range)
axis.set_xlim(xlims)
else:
xlims = None
if adj_y:
yy = (axis.dataLim.y0,axis.dataLim.y1)
y_range = yy[1]-yy[0]
ylims = (yy[0]-ypad*y_range,yy[1]+ypad*y_range)
axis.set_ylim(ylims)
else:
ylims = None
return xlims, ylims
Matplotlib allows you to annotate figures pretty easily:
fig, ax = plt.subplots(1, 1, figsize=(8,4))
ax.plot(t,np.zeros(len(t)),':',color=[.25]*4)
ax.plot(t,s,'r',lw=3)
xlims, ylims = adjust_limits(ax, xpad=0.01, ypad=.25)
ax.set_xticks(np.arange(0,1,.25))
ax.set_yticks(np.linspace(np.ceil(ylims[0]),np.floor(ylims[1]),5))
ax.set_xticks(np.linspace(0,1,5))
arrow_props = {'shrink':.05, 'width':2, 'headlength':10, 'headwidth':8, 'color':[.1,.1,.8,.75]}
max_xy = np.array([t[np.argmax(s)],np.max(s)])
#ax.annotate(s='$s_{max}$',xy=max_xy, xytext=max_xy+[.1,.5], fontsize=16, arrowprops=arrow_props)
ax.annotate(s=str(max_xy.round(2)),xy=max_xy, xytext=max_xy+[.1,.5], fontsize=14, arrowprops=arrow_props)
min_xy = np.array([t[np.argmin(s)],np.min(s)])
ax.annotate(s='$s_{min}$',xy=min_xy, xytext=min_xy+[-.15,-.25], fontsize=16, arrowprops=arrow_props);
print(max_xy)
print(min_xy)
If we have multivariate data, we can easily make a matrix of scatter plots to visualize the relationships between each pair of variables (and the distribution of each variable on its own). Here, we'll generate some random data with different magnitudes of statistical relationships between each pair:
np.random.seed(seed=3773)
n_obs = 500
var_1 = np.random.normal(loc=0,scale=1,size=n_obs)
var_2 = .75*var_1 + np.random.normal(loc=3,scale=2,size=n_obs)
var_3 = -.35*var_2 + np.random.normal(loc=-.5,scale=.5,size=n_obs)
var_4 = .5*var_3 + .5*var_1 + np.random.normal(loc=0,scale=.65,size=n_obs)
rvars = np.stack([var_1,var_2,var_3,var_4],axis=1)
nv = 4
fig, axes = plt.subplots(nrows=nv, ncols=nv, figsize=(8,8))
for i in range(nv):
for j in range(nv):
clr = [(i+j)/(2*nv),1-(i+j)/(2*nv),(i+j)/(2*nv),.5]
axt = axes[i,j]
if i==j:
axt.hist(rvars[:,i],bins=20,histtype='step', color=clr[:3], lw=2)
adjust_limits(axt, adj_y=False)
else:
axt.plot(rvars[:,i],rvars[:,j],'o',color=clr)
adjust_limits(axt)
if j != 0:
axt.set_yticks([])
if i != nv-1:
axt.set_xticks([])
Matplotlib has a large number of built-in specialty plots. For example, you can make box plots and violin plots using the boxplot()
and violinplot()
functions:
fig, axes = plt.subplots(1,2,figsize=(16,6))
ax1, ax2 = axes
ax1.boxplot(rvars, notch=True, sym='o');
ax2.violinplot(rvars, showmedians=True); ax2.set_xticks(np.arange(1,5));
If we want multiple subplots in a single figure, we are not limited to matrices of regularly-sized subplots:
fig = plt.figure(figsize=(10,10))
ax1 = plt.subplot2grid((4,4), (0,0), colspan=4)
ax2 = plt.subplot2grid((4,4), (1,0), colspan=3)
ax3 = plt.subplot2grid((4,4), (1,3), rowspan=3)
ax4 = plt.subplot2grid((4,4), (2,0))
ax5 = plt.subplot2grid((4,4), (3,0))
ax6 = plt.subplot2grid((4,4), (2,1), colspan=2, rowspan=2)
ax1.plot(t,s, lw=3)
ax2.plot(t,np.cos(s),'r--', lw=2)
ax3.plot(np.sin(2*np.pi*3*t)**5,t,'g:')
ax4.plot(t,np.sin(s),'m', lw=.5)
ax5.plot(t,s[::-1],'c')
ax6.plot(t,s**3,'k', lw=5);
As we've seen before, matplotlib makes it easy to plot a spectrogram and change the colormap
. Here we'll read in a sound file, create a bandpass filter, and illustrate the filter:
from scipy.signal import iirfilter, lfilter, freqz
fs,xx = wv.read('../w08/sentences/f0s2.wav')
nyq = fs/2
cut_freqs = np.array([1350,2350]) # Hz
cf_rad = np.array(cut_freqs/nyq)
Nord = 5
npts = 1024
b, a = iirfilter(N=Nord, Wn=cf_rad, btype='bandpass', ftype='cheby1', rp=2.5, rs=15)
w, fz = freqz(b, a, worN=npts)
fig, axes = plt.subplots(1,2,figsize=(16,4))
ax1, ax2 = axes
xl, xh = np.int(np.floor(.8*cf_rad[0]*npts)), np.int(np.floor(1.2*cf_rad[1]*npts))
ax1.plot(w[xl:xh], 20*np.log10(np.abs(fz[xl:xh])), 'm-', lw=2)
xticks = cf_rad*np.pi; adjust_limits(ax1, xpad=0)
ax1.set_xticks(xticks); ax1.set_xticklabels([np.int(nyq*i/np.pi) for i in xticks])
ax1.grid(axis='x'); ax1.set_ylabel('Amplitude (dB)'); ax1.set_xlabel('Freq (Hz)')
ax2.plot(w, 20*np.log10(np.abs(fz)), 'm-', lw=2)
ax2.set_xticks(xticks); ax2.set_xticklabels([np.int(nyq*i/np.pi) for i in xticks])
ax2.grid(axis='x'); ax2.set_xlabel('Freq (Hz)'); adjust_limits(ax2, xpad=.01, ypad=.05);
Next, we'll filter the sound and plot spectrograms of the unfiltered and filtered sounds:
x_bp = lfilter(b, a, xx)
x_arr = np.stack([xx,x_bp],axis=1)
fig, axes = plt.subplots(1,2,figsize=(15,5), sharex=True, sharey=True)
nfft = 2**10
nolp = 2**5
print(nfft,nolp)
#color_maps = pl.colormaps()
#print(color_maps)
plt.set_cmap('terrain')
for i,xt in enumerate(x_arr.T):
this_ax = axes[i]
spt = this_ax.specgram(x=xt,NFFT=nfft,Fs=fs,noverlap=nolp, scale='dB')
this_ax.set_ylim(0,6500)
With some help from numpy, we can easily make contour/mesh/etc plots, too. First, we'll use the numpy function meshgrid()
to make 2D arrays of x
and y
coordinates, then we'll create a 2D array of a complicated function of x
and y
:
L = 3
d = .15
x = np.arange(-L, L+d/2, d)
y = np.arange(-L, L+d/2, d)
X, Y = np.meshgrid(x, y)
R = np.sqrt(X**2 + Y**2)
Z = np.sin(R)**2 - .8*np.cos(Y+np.pi/7)**2 + .3*np.sin(X+np.pi/3)**2
print(X)
print('')
print(Y)
Here's a contour plot of the complicated function:
fig, ax = plt.subplots(1, 1, figsize=(8,8))
v = np.linspace(Z.min()-.05,Z.max()+.05,20)
plt.set_cmap('cubehelix_r')
CS = ax.contour(X,Y,Z,v)
ax.clabel(CS, inline=True, fontsize=8, inline_spacing=10, fmt='%1.2f');
We can plot the same function in a 3D figure with some additional matplotlib sub-modules:
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from matplotlib import cm
fig = plt.figure(figsize=(15,10))
ax = fig.gca(projection='3d')
surf = ax.plot_surface(X, Y, Z, cmap=cm.autumn, linewidth=0, antialiased=False)
ax.set_zlim(-1.0, 1.0)
ax.zaxis.set_major_locator(LinearLocator(9))
ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))
ax.view_init(elev=45, azim=-25)
fig.colorbar(surf, shrink=0.75, aspect=10);
Of course, we can (and should) plot actual data. For example, we can read in the raw data from McMurray & Jongman (2011) What Information Is Necessary for Speech Categorization? Harnessing Variability in the Speech Signal by Integrating Cues Computed Relative to Expectations. Psychological Review, 118(2), 219-246.
df_full = pd.read_csv('mnj_raw.csv', index_col=0)
cols = ['Talker','Vowel','Fricative','F0','F1','F2','F3','rms_f','rms_v','dur_f','dur_v','M1','M2']
df = df_full[cols].dropna() # .copy() method avoids annoying warning that is irrelevant here
voiced = ['dh','v','z','zh']
voice_lab = ['voiceless','voiced']
is_voiced = np.in1d(df['Fricative'],voiced)
df['Voicing'] = [voice_lab[i] for i in is_voiced]
alveolar = ['s','z','sh','zh']
place_lab = ['labial','alveolar']
is_alveolar = np.in1d(df['Fricative'],alveolar)
df['Place'] = [place_lab[i] for i in is_alveolar]
print(df.shape)
df.head()
We can make a matrix of scatter plots to illustrate this large-ish data set:
var_names = cols[3:]
avars = df[var_names].as_matrix()
nv = len(var_names)
fig, axes = plt.subplots(nrows=nv, ncols=nv, figsize=(20,20))
for i in range(nv):
for j in range(nv):
clr = [(i+j)/(2*nv),1-(i+j)/(2*nv),1-(i+j)/(2*nv),.25]
axt = axes[i,j]
if i==j:
hh = axt.hist(avars[:,i],bins=25,histtype='step', color=clr[:3], lw=2)
xl,yl = adjust_limits(axt, adj_y=False)
yrng = hh[0].max()
xrng = xl[1]-xl[0]
axt.text(xl[0]+.05*xrng,.8*yrng,var_names[i],fontsize=18)
else:
axt.plot(avars[:,j],avars[:,i],'.',ms=12,mew=0,color=clr)
adjust_limits(axt)
if j != 0:
axt.set_yticks([])
if i != nv-1:
axt.set_xticks([])
As we discussed last week, we can also use pandas' groupby()
function (with various aggregation functions) to make custom plots to illustrate statistical patterns of interest. For example, when voicing begins in a vowel after a voiced consonant, the voicing tends to be fairly consistent from one to the other. By way of contrast, when voicing begins in a vowel following a voiceless consonant, the transition from voicelessness to voicing corresponds to a transition from holding the vocal folds rigid (and open) to holding the vocal folds close together so that they can oscillate. The rigidity during the voiceless consonant tends to produce higher rates of oscillation when voicing begins.
We can make figure illustrating average F0 values for (the vowel immediately after) voiced and voiceless fricatives for each speaker in the data set:
mu_tlk_voi = df.groupby(['Talker','Voicing']).mean()
sd_tlk_voi = df.groupby(['Talker','Voicing']).std()
fig, ax = plt.subplots(1,1,figsize=(12,6))
vd_by_tlk = mu_tlk_voi.xs('voiced',level='Voicing')['F0']
vl_by_tlk = mu_tlk_voi.xs('voiceless',level='Voicing')['F0']
vd_by_tlk_sd = sd_tlk_voi.xs('voiced',level='Voicing')['F0']
vl_by_tlk_sd = sd_tlk_voi.xs('voiceless',level='Voicing')['F0']
for i,vv in enumerate(zip(vd_by_tlk,vl_by_tlk,vd_by_tlk_sd,vl_by_tlk_sd)):
dd,ll,dd_sd,ll_sd = vv
vdl, = ax.plot([2*i,2*i],[dd-dd_sd,dd+dd_sd],'-', color=blue)
vll, = ax.plot([2*i+1,2*i+1],[ll-ll_sd,ll+ll_sd],'-', color=red)
ax.plot([2*i,2*i+1],[dd,ll],':',color=purple,lw=2)
vds, = ax.plot(np.arange(0,40,2),vd_by_tlk,'o',color=blue,ms=8)
vls, = ax.plot(np.arange(1,41,2),vl_by_tlk,'s',color=red,ms=8)
ax.tick_params(labelsize=14)
ax.set_xticks(np.arange(0.5,40.5,2))
ax.set_xticklabels(vd_by_tlk.index)
ax.set_ylabel('mean F0', fontsize=16); ax.set_xlabel('Talker', fontsize=16)
ax.legend([(vds,vdl),(vls,vll)],['Voiced','Voiceless'], fontsize=18);
If we don't want to fiddle with matplotlib, there are some other visualization packages available, too. There is a ggplot clone, a package called altair, and, as we've used repeatedly already, seaborn.
Since we've already imported seaborn, our plots are doing some seaborn-specific things. We'll play around with some seaborn settings to see what the basic options are. First, we'll define a function to illustrate multiple sinusoids in a single plot:
def sinplot(flip=1, ax=None):
if ax is None:
fig, ax = plt.subplots(1, 1, figsize=(12,6))
x = np.linspace(0, 14, 100)
for i in range(1, 7):
ax.plot(x, np.sin(x + i * .5) * (7 - i) * flip)
sns.set(style='whitegrid')
sinplot()
sns.set_style('darkgrid')
sinplot()
sns.set_style('dark')
sinplot()
sns.set_style('white')
sinplot()
sns.set_style('ticks')
sinplot()
If you want to simplify the axes and ticks, seaborn has a nice function despine()
(which we've seen before at least a couple times):
sinplot()
sns.despine(trim=True)
Seaborn works "on top of" matplotlib, so you can use both simultaneously:
fig, axes = plt.subplots(2, 2, figsize=(20,10))
axes = axes.ravel()
ctxt = ['notebook','paper','talk','poster']
for ii in range(4):
sns.set_context(ctxt[ii])
sinplot(ax=axes[ii])
axes[ii].text(0,-6,ctxt[ii], fontsize=20)
Seaborn makes it easy to fiddle with the color palette (and make fancy 2D figures):
x, y = np.random.multivariate_normal([0, 0], [[1, -.5], [-.5, 1]], size=500).T
cmap = sns.light_palette('darkgreen', as_cmap=True)
sns.set_style('whitegrid')
fig, ax = plt.subplots(1, 1, figsize=(8,8))
sns.kdeplot(x, y, cmap=cmap, shade=True, ax=ax, cut=0);
cmap = sns.light_palette('darkblue', as_cmap=True)
sns.set_style('ticks')
fig, ax = plt.subplots(1, 1, figsize=(8,8))
sns.kdeplot(df['M1'], df['M2'], cmap=cmap, shade=True, ax=ax, cut=0, cbar=True, levels=np.linspace(0,3e-7,11));
You can visualize both 2D and 1D distributions simultaneously:
sns.set_style('ticks')
sns.jointplot(df['F2'],df['F3'], size=8, kind='hex'); # kind='kde'bb
Seaborn has some nice, simple methods for making grids of plots:
sns.pairplot?
sns.set_style('darkgrid')
iris = sns.load_dataset('iris')
sns.pairplot(iris, hue='species', markers=['s','o','^']);
iris.head(n=3)
Seaborn also builds in some nice details with some basic statistical plots:
x = np.random.gamma(shape=3, scale=2, size=150)
fig, ax = plt.subplots(1, 1, figsize=(16,6))
sns.distplot(x, hist=True, kde=True, rug=True, fit=stats.gamma, ax=ax);
fig, ax = plt.subplots(1, 1, figsize=(16,6))
sns.distplot(df['F0'], hist=True, kde=True, rug=True, ax=ax);
Let's return to the McMurray & Jongman data to illustrate some (more) exploratory use of visualization with real data.
Recall that we specified two phonetic features that are likely to influence at least some of the acoustic measurements in the data set (and that we verified did influence at least one measurement).
We will use the hue
argument in the pairplot()
function to see if, and if so how, voicing and place influence a subset of the acoustic measures in the data set.
df.head()
There are good theoretical reasons to expect voicing to influence F0 (as we saw above). Voicing and place are also likely to influence rms_f (the root mean square energy in the fricative) and dur_f (the duration of the fricative), and it should also influence F1, F2, M1 (spectral mean), M2 (spectral variance), and maybe F3. We'll look at just these variables, broken up into two plots, coding voicing and place by color:
cols = ['F1','F2','F3','Voicing and Place']
mu_df = df.groupby(['Talker','Vowel','Fricative']).mean()
mu_df.reset_index(inplace=True)
is_voiced = np.in1d(mu_df['Fricative'],voiced)
mu_df['Voicing'] = [voice_lab[i] for i in is_voiced]
is_alveolar = np.in1d(mu_df['Fricative'],alveolar)
mu_df['Place'] = [place_lab[i] for i in is_alveolar]
mu_df['Voicing and Place'] = mu_df['Voicing'] + ' ' + mu_df['Place']
sns.pairplot(mu_df[cols], hue='Voicing and Place', markers=['v','^','o','s'],
hue_order=['voiceless labial','voiced labial','voiceless alveolar','voiced alveolar'],
size=5, diag_kind='kde',plot_kws={'alpha':.75});
cols = ['rms_f','dur_f','M1','M2','Voicing and Place']
sns.pairplot(mu_df[cols], hue='Voicing and Place', markers=['v','^','o','s'],
hue_order=['voiceless labial','voiced labial','voiceless alveolar','voiced alveolar'],
size=5, diag_kind='kde',plot_kws={'alpha':.75});
The package altair has a number of nice features, too, though I haven't used it as much, so I'm less familiar with it. It is designed to operate at a higher level of abstraction, and it's not built on top of matplotlib, so, unlike seaborn, you can't use matplotlib in conjunction with altair to customize plots.
Here are some altair examples taken from the documentation:
from altair import Chart, X, Y, Axis
np.random.seed(seed=5)
cats = 'ABCDE'
means = np.arange(0,10,step=2)*2 - 6
n_obs = 250
indices = np.random.randint(low=0,high=5,size=n_obs)
data = pd.DataFrame({'Category':[cats[i] for i in indices],
'Value':[np.random.normal(loc=means[i],scale=10) for i in indices]})
print(data.head())
chart = Chart(data)
#chart.mark_bar().encode(y='a',x='mean(b)')
chart.mark_square(color='darkgreen').encode(
X('Category',type='nominal',axis=Axis(title='Category')),
Y('Value',type='quantitative')#,aggregate='mean',axis=Axis(title='Mean Value'))
)
from altair import Bin, Color, Scale, expr, load_dataset
df_w = load_dataset('seattle-weather')
df_w.head()
df_w.tail()
ch_w = Chart(df_w)
ch_w.mark_bar().encode(X('precipitation',bin=Bin(min=0,max=60,step=2)),Y('count(*):Q'))
scale = Scale(domain=['sun', 'fog', 'drizzle', 'rain', 'snow'],
range=['#e7ba52', '#c7c7c7', '#aec7e8', '#1f77b4', '#9467bd'])
df_w['temp_range'] = df_w.temp_max-df_w.temp_min
ch_w.mark_line(color='darkred',lineSize=3).encode(
X('date:T', timeUnit='month'),
Y('mean(precipitation)')
)
ch_w.mark_line().encode(X('date:T', timeUnit='yearmonth'),Y('max(wind)'))
ch_w.mark_line().encode(X('date:T', timeUnit='yearmonth'),Y('mean(temp_range)'))
ch_w.mark_line(color='purple',lineSize=3).encode(
X('date:T', timeUnit='month'),
Y('max(temp_min)')
)
ch_w.mark_bar().encode(
X('date:T', timeUnit='yearmonth'),
Y('count(*)'),
Color('weather', scale=scale),
)
ch_w.mark_point().encode(
X('temp_max', axis=Axis(title='Maximum Daily Temperature (C)')),
Y('temp_range', axis=Axis(title='Daily Temperature Range (C)')),
Color('weather', scale=scale),
size='precipitation',
).configure_cell(width=800, height=600)