The Setup (usesthis.com) asks people what they use to get their jobs done. This notebook is an attempt to read and analyze that interview data.
The base dir is the local directory where the usesthis GitHub repository is checked out.
import pandas as pd
import os, markdown, glob, yaml
from re import match, findall, DOTALL
base_dir = '/Users/lpd/gh/usesthis/'


def extract_yaml_header(md_string):
    """Return the YAML front matter of a post, or None if there is none.

    The header runs from the leading ``---`` up to, but not including, the
    first following line that starts with ``---``.
    """
    # Non-greedy on purpose: a greedy ``.*`` under DOTALL would run to the
    # LAST newline followed by ``---`` in the file and pull body text (for
    # example a Markdown horizontal rule) into the header.
    header = match(r'(?P<yml>^---.*?)\n---', md_string, DOTALL)
    if header is None:
        return None
    return header.group('yml')


def extract_ware_slugs(md_string):
    """Return the slugs of all reference-style links ``[text][slug]``.

    Links with an empty reference (``[text][]``) are ignored.
    """
    links = findall(r'\[([^\[\(\)]+)\]\[([a-z0-9\.\-]+)?\]', md_string)
    return [slug for _text, slug in links if slug]


# Every file under _posts/ is treated as one interview.
interview_files = glob.glob('%s/_posts/*' % base_dir)
all_interviews = []
for interview_file_name in interview_files:
    with open(interview_file_name, 'r') as interview_file:
        md_string = interview_file.read()
    yml_header = extract_yaml_header(md_string)
    if yml_header is None:
        # No front matter, so there is no interview metadata to record.
        continue
    # safe_load: the header is plain data, and yaml.load() without an
    # explicit Loader is rejected by current PyYAML releases.
    interview = yaml.safe_load(yml_header)
    if not isinstance(interview, dict):
        # Empty or non-mapping front matter cannot become a frame row.
        continue
    interview['wares'] = extract_ware_slugs(md_string)
    all_interviews.append(interview)
# One row per interview; the keys of each interview dict become the columns.
interview_frame = pd.DataFrame(all_interviews)
# Remove the layout, credits, summary and title columns, in that order.
for unused_column in ('layout', 'credits', 'summary', 'title'):
    del interview_frame[unused_column]
# Flatten the per-interview ware lists into one long list of slugs, then wrap
# it in a Series so pandas can count occurrences.
each_used_ware = [
    ware_slug
    for person_ware in interview_frame.wares
    for ware_slug in person_ware
]
ps_wares = pd.Series(each_used_ware)
# Occurrence count per ware slug.
ps_wares.value_counts()
# Bar chart of the first 15 entries of those counts (value_counts sorts
# descending by default, so these are the most frequently mentioned wares).
ps_wares.value_counts()[:15].plot(kind='bar')
<matplotlib.axes.AxesSubplot at 0x108d3e690>
# Rows with index labels 0 through 10. The .ix indexer no longer exists in
# pandas; .loc slices by label and, like .ix did here, includes the end label.
interview_frame.loc[:10]
<class 'pandas.core.frame.DataFrame'> Int64Index: 11 entries, 0 to 10 Data columns (total 4 columns): categories 11 non-null values license 0 non-null values slug 11 non-null values wares 11 non-null values dtypes: object(4)
interview_frame
<class 'pandas.core.frame.DataFrame'> Int64Index: 374 entries, 0 to 373 Data columns (total 4 columns): categories 374 non-null values license 1 non-null values slug 374 non-null values wares 374 non-null values dtypes: object(4)
len(ps_wares)
5071
# Each interview carries a list of categories; flatten them into one list so
# that every category assignment appears once per interview that has it.
iv_cats = interview_frame.categories
all_cats = [category for categories in iv_cats for category in categories]
all_cats[:10]
['developer', 'mac', 'web', 'mac', 'musician', 'game', 'suit', 'windows', 'developer', 'mac']
# Distinct category labels (duplicates collapsed).
set_cats = set(all_cats)
# Total number of category entries, duplicates included.
len(all_cats)
1260
len(set_cats)
72
# Wrap the flat category list in a Series so occurrences can be counted.
ps_all_cats = pd.Series(all_cats)
# Horizontal bar chart of the first 10 entries of the category counts
# (value_counts sorts descending by default, so these are the most common).
ps_all_cats.value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x108d85ad0>
# Horizontal bar chart of the last 25 entries of the category counts, i.e. the
# least common categories given value_counts' default descending order.
ps_all_cats.value_counts()[-25:].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x108da9390>
# Distinct ware slugs among all ware mentions.
set_wares = set(ps_wares)
# Number of distinct ware slugs.
len(set_wares)
2236
def load_ware_descriptions(ware_files):
    """Read one YAML description per file and return them as a list of dicts.

    Each dict gets a 'title' key holding the file's base name without its
    extension.
    """
    descriptions = []
    for ware_file_name in ware_files:
        # Context manager so the handle is closed after every file.
        with open(ware_file_name) as ware_file:
            # safe_load: plain data only, and yaml.load() without an explicit
            # Loader is rejected by current PyYAML releases. An empty file
            # parses to None, so fall back to an empty description.
            desc = yaml.safe_load(ware_file) or {}
        desc['title'] = os.path.splitext(os.path.basename(ware_file_name))[0]
        descriptions.append(desc)
    return descriptions


wares = glob.glob('%s/wares/hardware/*' % base_dir)
hardware = load_ware_descriptions(wares)
wares = glob.glob('%s/wares/software/*' % base_dir)
software = load_ware_descriptions(wares)
ps_wares
0 macbook-pro 1 cinema-display 2 readynas-nv-plus 3 time-machine 4 airport-express 5 safari-adblock 6 google-calendar 7 istat-pro 8 google-apps 9 review-board 10 bloom-ios 11 google-reader 12 plan-9 13 audio-kontrol-1 14 powershot-sd1000 ... 5056 expansion-portable 5057 mac-os-x 5058 thinkpad-t410s 5059 thinkpad-t410 5060 iphone-5 5061 vmware-fusion 5062 chrome 5063 subversion 5064 macbook-pro 5065 google-docs 5066 tweetbot-ios 5067 live 5068 final-cut-pro 5069 creative-suite 5070 macbook-pro Length: 5071, dtype: object
# One-column frame holding every ware mention (one row per mention).
pf_used_ware = pd.DataFrame(ps_wares)
pf_used_ware
<class 'pandas.core.frame.DataFrame'> Int64Index: 5071 entries, 0 to 5070 Data columns (total 1 columns): 0 5071 non-null values dtypes: object(1)
# Name the single column 'title', the same key that is set on each ware
# description, so the two can be joined on it.
pf_used_ware.columns = ['title']
pf_used_ware
<class 'pandas.core.frame.DataFrame'> Int64Index: 5071 entries, 0 to 5070 Data columns (total 1 columns): title 5071 non-null values dtypes: object(1)
software[:2]
[{'description': 'An audio plugin that emulates the 1176 limiter.', 'name': '1176 (plugin)', 'title': '1176', 'url': 'http://www.uaudio.com/store/compressors-limiters/1176-collection.html'}, {'description': 'A web service for uploading videos from mobile phones or webcams.', 'name': '12seconds', 'title': '12seconds', 'url': 'http://12seconds.tv/'}]
# Tag every description with the kind of ware it is (software first, then
# hardware), then combine both lists with hardware leading.
for ware_kind, descriptions in (('software', software), ('hardware', hardware)):
    for el in descriptions:
        el['type'] = ware_kind
ware = hardware + software
ware[:3]
[{'description': 'A very powerful scanner.', 'name': '10000XL', 'title': '10000xl', 'type': 'hardware', 'url': 'http://www.epson.com/cgi-bin/Store/jsp/Product.do?sku=E10000XL-PH'}, {'description': 'Old speakers.', 'name': '104aB', 'title': '104ab', 'type': 'hardware', 'url': 'http://www.kef.com/html/us/explore/about_kef/museum/1970s/104aBKIT_CantataKIT/index.html'}, {'description': 'A subwoofer.', 'name': '10S', 'title': '10s', 'type': 'hardware', 'url': 'http://www.shopping.com/KRK-KRK-10S-Subwoofer/info?sb=1'}]
len(ware)
3326
len(hardware)
1373
len(software)
1953
1953+1373
3326
# Frame of all known ware descriptions, one row per ware.
extracted_ware = pd.DataFrame(ware)
# Attach the description to every ware mention; the inner join keeps only
# mentions whose title has a matching description.
user_wares = pd.merge(pf_used_ware, extracted_ware, how='inner', on='title')
user_wares[:20]
<class 'pandas.core.frame.DataFrame'> Int64Index: 20 entries, 0 to 19 Data columns (total 5 columns): title 20 non-null values description 20 non-null values name 20 non-null values type 20 non-null values url 20 non-null values dtypes: object(5)
# First 10 title counts among hardware mentions; .loc replaces the removed .ix indexer.
user_wares.loc[user_wares.type == 'hardware'].title.value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x109cf5e90>
# First 10 title counts among software mentions; .loc replaces the removed .ix indexer.
user_wares.loc[user_wares.type == 'software'].title.value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x109d3f450>
# First 10 title counts for wares whose description contains 'headphone'; .loc replaces the removed .ix indexer.
user_wares.loc[user_wares.description.str.contains('headphone')].title.value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x109f37650>
# First 10 title counts for wares whose description contains 'keyboard'; .loc replaces the removed .ix indexer.
user_wares.loc[user_wares.description.str.contains('keyboard')].title.value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x109fdd0d0>
# First 10 title counts for wares whose description contains 'display'; .loc replaces the removed .ix indexer.
user_wares.loc[user_wares.description.str.contains('display')].title.value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10a2394d0>
# First 10 title counts for wares whose description contains 'phone'; .loc replaces the removed .ix indexer.
# NOTE(review): this is a substring match, so 'headphone' descriptions are included too.
user_wares.loc[user_wares.description.str.contains('phone')].title.value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10a2d38d0>
# First 10 title counts for wares whose description contains 'text'; .loc replaces the removed .ix indexer.
user_wares.loc[user_wares.description.str.contains('text')].title.value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10a485290>
# First 10 title counts for wares whose description contains 'backup'; .loc replaces the removed .ix indexer.
user_wares.loc[user_wares.description.str.contains('backup')].title.value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10a8011d0>
# First 10 title counts for wares whose description contains 'disk'; .loc replaces the removed .ix indexer.
user_wares.loc[user_wares.description.str.contains('disk')].title.value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10a818690>
# First 10 title counts for wares whose description contains 'storage'; .loc replaces the removed .ix indexer.
user_wares.loc[user_wares.description.str.contains('storage')].title.value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10ac8d8d0>
# First 10 title counts for wares whose description contains 'server'; .loc replaces the removed .ix indexer.
user_wares.loc[user_wares.description.str.contains('server')].title.value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10aca4310>
# First 10 title counts for wares whose description contains 'chair'; .loc replaces the removed .ix indexer.
user_wares.loc[user_wares.description.str.contains('chair')].title.value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10ace0f10>
# First 10 title counts for wares whose description contains 'tablet'; .loc replaces the removed .ix indexer.
user_wares.loc[user_wares.description.str.contains('tablet')].title.value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10b0286d0>
# First 10 title counts for wares whose description contains 'console'; .loc replaces the removed .ix indexer.
user_wares.loc[user_wares.description.str.contains('console')].title.value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10b37e950>
# First 10 title counts for wares whose description contains 'terminal'; .loc replaces the removed .ix indexer.
user_wares.loc[user_wares.description.str.contains('terminal')].title.value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10b394e50>
# First 10 title counts for wares whose description contains 'pen'; .loc replaces the removed .ix indexer.
# NOTE(review): this is a substring match, so words such as 'open' also count as hits.
user_wares.loc[user_wares.description.str.contains('pen')].title.value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10b3c9e90>
# First 10 title counts for wares whose description contains 'printer'; .loc replaces the removed .ix indexer.
user_wares.loc[user_wares.description.str.contains('printer')].title.value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10b8183d0>
# First 10 title counts for wares whose description contains 'scanner'; .loc replaces the removed .ix indexer.
user_wares.loc[user_wares.description.str.contains('scanner')].title.value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10b8be390>
whos
Variable Type Data/Info -------------------------------------------- DOTALL int 16 all_cats list n=1260 all_interviews list n=374 base_dir str /Users/lpd/gh/usesthis/ desc dict n=5 each_used_ware list n=5071 el dict n=5 extracted_ware DataFrame <class 'pandas.core.frame<...>values\ndtypes: object(5) findall function <function findall at 0x100467f50> glob module <module 'glob' from '/Use<...>/lib/python2.7/glob.pyc'> hardware list n=1373 interview dict n=6 interview_file file <closed file '/Users/lpd/<...> mode 'r' at 0x1087ffdb0> interview_file_name str /Users/lpd/gh/usesthis//_<...>9-jason.forrest.interview interview_files list n=374 interview_frame DataFrame <class 'pandas.core.frame<...>values\ndtypes: object(4) iv_cats Series 0 [d<...>ength: 374, dtype: object markdown module <module 'markdown' from '<...>s/markdown/__init__.pyc'> match function <function match at 0x100467cf8> md_string str ---\nlayout: interview\ns<...>ed in generational terms. os module <module 'os' from '/Users<...>da/lib/python2.7/os.pyc'> pd module <module 'pandas' from '/U<...>ges/pandas/__init__.pyc'> person_ware list n=4 pf_used_ware DataFrame <class 'pandas.core.frame<...>values\ndtypes: object(1) ps_all_cats Series 0 developer\n1 <...>ngth: 1260, dtype: object ps_wares Series 0 macbook-pro\n1<...>ngth: 5071, dtype: object set_cats set set(['crafter', 'biker', <...>', 'baker', 'usability']) set_wares set set(['apex-model-edge-pro<...>-quadra', 'path-finder']) software list n=1953 user_wares DataFrame <class 'pandas.core.frame<...>values\ndtypes: object(5) ware list n=3326 wares list n=1953 yaml module <module 'yaml' from '/Use<...>kages/yaml/__init__.pyc'> yml_header str ---\nlayout: interview\ns<...>ncategories: [mac, music]
user_wares
<class 'pandas.core.frame.DataFrame'> Int64Index: 5070 entries, 0 to 5069 Data columns (total 5 columns): title 5070 non-null values description 5070 non-null values name 5070 non-null values type 5070 non-null values url 5070 non-null values dtypes: object(5)