The Setup (usesthis.com) interviews people about the hardware and software they use to get their jobs done. This notebook reads and analyzes that interview data.
`base_dir` is the path where the usesthis GitHub repository is checked out locally.
import pandas as pd
import os, markdown, glob, yaml
from re import match, findall, DOTALL
def front_matter(md_string):
    """Return the YAML front matter of an interview post, or None.

    The header starts with ``---`` at the very beginning of the text and
    runs up to (not including) the next line that begins with ``---``.
    The match is non-greedy so that a later ``---`` line inside the
    interview body cannot be mistaken for the end of the header.
    """
    header = match(r'(?P<yml>^---.*?)\n---', md_string, DOTALL)
    return header.group('yml') if header else None


def referenced_wares(md_string):
    """Return the slugs of all ``[text][slug]`` reference links, in order.

    Links with an empty reference (``[text][]``) are skipped.
    """
    links = findall(r'\[([^\[\(\)]+)\]\[([a-z0-9\.\-]+)?\]', md_string)
    return [slug for _text, slug in links if slug]


# Root of the local checkout of the usesthis repository.
base_dir = '/Users/lpd/gh/usesthis/'
interview_files = glob.glob(os.path.join(base_dir, '_posts', '*'))
all_interviews = []
for interview_file_name in interview_files:
    with open(interview_file_name, 'r') as interview_file:
        md_string = interview_file.read()
    yml_header = front_matter(md_string)
    if yml_header is None:
        # No front matter, so this is not an interview post; skip it
        # instead of failing on a missing match.
        continue
    # safe_load only builds plain Python objects and, unlike yaml.load,
    # does not need an explicit Loader argument on current PyYAML.
    interview = yaml.safe_load(yml_header) or {}
    interview['wares'] = referenced_wares(md_string)
    all_interviews.append(interview)
# One row per interview; the presentation-only front-matter fields are
# removed so that only the columns used in the analysis remain.
interview_frame = pd.DataFrame(all_interviews)
for unused_column in ('layout', 'credits', 'summary', 'title'):
    del interview_frame[unused_column]
# Flatten the per-interview ware lists into one long list of slugs.
each_used_ware = [
    ware_slug
    for person_ware in interview_frame.wares
    for ware_slug in person_ware
]
ps_wares = pd.Series(each_used_ware)
# Number of mentions per ware, most frequent first.
ware_counts = ps_wares.value_counts()
ware_counts
# Bar chart of the fifteen most frequently mentioned wares.
ware_counts.head(15).plot(kind='bar')
<matplotlib.axes.AxesSubplot at 0x108af38d0>
# Rows labelled 0 through 10 (.loc slices include the end label).
# .loc replaces the .ix indexer, which no longer exists in current pandas.
interview_frame.loc[:10]
<class 'pandas.core.frame.DataFrame'> Int64Index: 11 entries, 0 to 10 Data columns (total 4 columns): categories 11 non-null values license 0 non-null values slug 11 non-null values wares 11 non-null values dtypes: object(4)
# Display the whole interview frame.
interview_frame
<class 'pandas.core.frame.DataFrame'> Int64Index: 373 entries, 0 to 372 Data columns (total 4 columns): categories 373 non-null values license 1 non-null values slug 373 non-null values wares 373 non-null values dtypes: object(4)
# Total number of ware mentions across all interviews (duplicates included).
len(ps_wares)
5066
# Flatten the per-interview category lists into one list of category tags.
iv_cats = interview_frame.categories
all_cats = [category for el in iv_cats for category in el]
# Peek at the first ten tags.
all_cats[:10]
['developer', 'mac', 'web', 'mac', 'musician', 'game', 'suit', 'windows', 'developer', 'mac']
# Distinct category names; the length below is the total number of category
# tags, duplicates included.
set_cats = set(all_cats)
len(all_cats)
1258
# Number of distinct categories.
len(set_cats)
72
ps_all_cats = pd.Series(all_cats)
# Horizontal bar chart of the ten most common categories.
category_counts = ps_all_cats.value_counts()
category_counts.head(10).plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x108c45590>
# Horizontal bar chart of the twenty-five least common categories.
ps_all_cats.value_counts().tail(25).plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x108c68d90>
# Distinct ware slugs mentioned in the interviews, and how many there are.
set_wares = set(ps_wares)
len(set_wares)
2236
def _read_ware_files(ware_paths):
    """Load one description dict per YAML file in *ware_paths*.

    Each dict is the parsed YAML content plus a ``'title'`` key holding the
    file name without its extension.
    """
    descriptions = []
    for ware_path in ware_paths:
        # The context manager closes the file right after parsing.
        with open(ware_path) as ware_file:
            # safe_load needs no explicit Loader and builds only plain objects.
            desc = yaml.safe_load(ware_file)
        desc['title'] = os.path.splitext(os.path.basename(ware_path))[0]
        descriptions.append(desc)
    return descriptions


wares = glob.glob(os.path.join(base_dir, 'wares', 'hardware', '*'))
hardware = _read_ware_files(wares)
wares = glob.glob(os.path.join(base_dir, 'wares', 'software', '*'))
software = _read_ware_files(wares)
# Display the series of mentioned ware slugs.
ps_wares
0 macbook-pro 1 cinema-display 2 readynas-nv-plus 3 time-machine 4 airport-express 5 safari-adblock 6 google-calendar 7 istat-pro 8 google-apps 9 review-board 10 bloom-ios 11 google-reader 12 plan-9 13 audio-kontrol-1 14 powershot-sd1000 ... 5051 final-cut-pro 5052 macbook-pro 5053 macbook-air 5054 27q 5055 expansion-portable 5056 mac-os-x 5057 thinkpad-t410s 5058 thinkpad-t410 5059 iphone-5 5060 vmware-fusion 5061 chrome 5062 subversion 5063 macbook-pro 5064 google-docs 5065 tweetbot-ios Length: 5066, dtype: object
# Wrap the series of mentioned slugs in a single-column DataFrame and display it.
pf_used_ware = pd.DataFrame(ps_wares)
pf_used_ware
<class 'pandas.core.frame.DataFrame'> Int64Index: 5066 entries, 0 to 5065 Data columns (total 1 columns): 0 5066 non-null values dtypes: object(1)
# Name the column 'title', the same key the ware descriptions carry,
# then display the frame again.
pf_used_ware.columns = ['title']
pf_used_ware
<class 'pandas.core.frame.DataFrame'> Int64Index: 5066 entries, 0 to 5065 Data columns (total 1 columns): title 5066 non-null values dtypes: object(1)
# Peek at the first two software descriptions.
software[:2]
[{'description': 'An audio plugin that emulates the 1176 limiter.', 'name': '1176 (plugin)', 'title': '1176', 'url': 'http://www.uaudio.com/store/compressors-limiters/1176-collection.html'}, {'description': 'A web service for uploading videos from mobile phones or webcams.', 'name': '12seconds', 'title': '12seconds', 'url': 'http://12seconds.tv/'}]
# Tag every description with the kind of ware it describes, then combine
# both lists (hardware first) into one catalogue.
for ware_type, descriptions in (('software', software), ('hardware', hardware)):
    for description in descriptions:
        description['type'] = ware_type
ware = hardware + software
# Peek at the first three catalogue entries.
ware[:3]
[{'description': 'A very powerful scanner.', 'name': '10000XL', 'title': '10000xl', 'type': 'hardware', 'url': 'http://www.epson.com/cgi-bin/Store/jsp/Product.do?sku=E10000XL-PH'}, {'description': 'Old speakers.', 'name': '104aB', 'title': '104ab', 'type': 'hardware', 'url': 'http://www.kef.com/html/us/explore/about_kef/museum/1970s/104aBKIT_CantataKIT/index.html'}, {'description': 'A subwoofer.', 'name': '10S', 'title': '10s', 'type': 'hardware', 'url': 'http://www.shopping.com/KRK-KRK-10S-Subwoofer/info?sb=1'}]
# Number of entries in the combined catalogue.
len(ware)
3326
# Number of hardware descriptions.
len(hardware)
1373
# Number of software descriptions.
len(software)
1953
# Sanity check: the two lists together should account for every catalogue
# entry. Computed from the lists so it stays correct when the checkout changes.
len(software) + len(hardware)
3326
# Catalogue of known wares as a frame, joined onto the mentions by title.
# The inner join keeps only mentions that have a catalogue entry.
extracted_ware = pd.DataFrame(ware)
user_wares = pd.merge(pf_used_ware, extracted_ware, on='title', how='inner')
# First twenty joined rows.
user_wares.head(20)
<class 'pandas.core.frame.DataFrame'> Int64Index: 20 entries, 0 to 19 Data columns (total 5 columns): title 20 non-null values description 20 non-null values name 20 non-null values type 20 non-null values url 20 non-null values dtypes: object(5)
# Ten most mentioned hardware titles. .loc replaces the removed .ix indexer;
# the column is read as ['type'] rather than as an attribute.
user_wares.loc[user_wares['type'] == 'hardware', 'title'].value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x109ce8dd0>
# Ten most mentioned software titles. .loc replaces the removed .ix indexer;
# the column is read as ['type'] rather than as an attribute.
user_wares.loc[user_wares['type'] == 'software', 'title'].value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x109cff5d0>
# Ten most mentioned wares whose description contains 'headphone'.
# .loc replaces the removed .ix; na=False treats missing descriptions as no match.
user_wares.loc[user_wares.description.str.contains('headphone', na=False), 'title'].value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x109ee27d0>
# Ten most mentioned wares whose description contains 'keyboard'.
# .loc replaces the removed .ix; na=False treats missing descriptions as no match.
user_wares.loc[user_wares.description.str.contains('keyboard', na=False), 'title'].value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10a19b990>
# Ten most mentioned wares whose description contains 'display'.
# .loc replaces the removed .ix; na=False treats missing descriptions as no match.
user_wares.loc[user_wares.description.str.contains('display', na=False), 'title'].value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10a1e0790>
# Ten most mentioned wares whose description contains 'phone'.
# .loc replaces the removed .ix; na=False treats missing descriptions as no match.
# NOTE(review): this is a substring match, so 'headphone' descriptions are
# included as well — confirm that is intended.
user_wares.loc[user_wares.description.str.contains('phone', na=False), 'title'].value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10a27dfd0>
# Ten most mentioned wares whose description contains 'text'.
# .loc replaces the removed .ix; na=False treats missing descriptions as no match.
user_wares.loc[user_wares.description.str.contains('text', na=False), 'title'].value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10a298890>
# Ten most mentioned wares whose description contains 'backup'.
# .loc replaces the removed .ix; na=False treats missing descriptions as no match.
user_wares.loc[user_wares.description.str.contains('backup', na=False), 'title'].value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10a79a2d0>
# Ten most mentioned wares whose description contains 'disk'.
# .loc replaces the removed .ix; na=False treats missing descriptions as no match.
user_wares.loc[user_wares.description.str.contains('disk', na=False), 'title'].value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10a7b5110>
# Ten most mentioned wares whose description contains 'storage'.
# .loc replaces the removed .ix; na=False treats missing descriptions as no match.
user_wares.loc[user_wares.description.str.contains('storage', na=False), 'title'].value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10b41af10>
# Ten most mentioned wares whose description contains 'server'.
# .loc replaces the removed .ix; na=False treats missing descriptions as no match.
user_wares.loc[user_wares.description.str.contains('server', na=False), 'title'].value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10b431850>
# Ten most mentioned wares whose description contains 'chair'.
# .loc replaces the removed .ix; na=False treats missing descriptions as no match.
user_wares.loc[user_wares.description.str.contains('chair', na=False), 'title'].value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10b4e8ed0>
# Ten most mentioned wares whose description contains 'tablet'.
# .loc replaces the removed .ix; na=False treats missing descriptions as no match.
user_wares.loc[user_wares.description.str.contains('tablet', na=False), 'title'].value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10bc135d0>
# Ten most mentioned wares whose description contains 'console'.
# .loc replaces the removed .ix; na=False treats missing descriptions as no match.
user_wares.loc[user_wares.description.str.contains('console', na=False), 'title'].value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10bcb26d0>
# Ten most mentioned wares whose description contains 'terminal'.
# .loc replaces the removed .ix; na=False treats missing descriptions as no match.
user_wares.loc[user_wares.description.str.contains('terminal', na=False), 'title'].value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10bccf250>
# Ten most mentioned wares whose description contains 'pen'.
# .loc replaces the removed .ix; na=False treats missing descriptions as no match.
# NOTE(review): this is a substring match, so words such as 'open' also
# match — confirm that is intended.
user_wares.loc[user_wares.description.str.contains('pen', na=False), 'title'].value_counts()[:10].plot(kind='barh')
<matplotlib.axes.AxesSubplot at 0x10c1330d0>