某勾数据分析系列(三)之数据可视化

環境

python 3.9.0+ macos Big Sur 11.2.3+ Google Chrome 89.0.4389.114

python包 : pandas,re,matplotlib,numpy,os

正文

我們做好了相对应的数据处理,就能产生数据图表也就是可视化分析数据了。 首先,我们要做的数据处理包括,获取平均工资,学历,地点,工作年限 等等。

数据清洗

因为系列中上一篇文章讲了怎么清洗数据的知识,所以我这里直接上代码

class Clean:
    def __init__(self):
        self.data = pandas.read_json('newData.json')

        '''
        重組排序規則,使用自己的id作為索引
        '''
        self.data.index = self.data['id']
        del (self.data['id'])
        dfSort = self.data.sort_index()
        self.data = dfSort

    def wageDataRinse(self):
        self.data['bottom'] = self.data['top'] = self.data['average'] = self.data['positionWage']

        pattern = re.compile('([0-9]+)')
        q1 = q2 = q3 = q4 = 0

        for i in range(len(self.data['positionWage'])):
            item = self.data['positionWage'].iloc[i].strip()
            result = re.findall(pattern, item)
            try:
                if result:
                    try:
                        # 此语句执行成功则表示result[0],result[1]都存在,即职位月薪形如‘6000-8000元/月’
                        self.data['bottom'].iloc[i], self.data['top'].iloc[i] = result[0], result[1]
                        self.data['average'].iloc[i] = str((int(result[0]) + int(result[1])) / 2)
                        q1 += 1

                    except:
                        # 此语句执行成功则表示result[0]存在,result[1]不存在,职位月薪形如‘10000元/月以下’
                        self.data['bottom'].iloc[i] = df['top'].iloc[i] = result[0]
                        self.data['average'].iloc[i] = str((int(result[0]) + int(result[0])) / 2)
                        q2 += 1
                else:
                    # 此语句执行成功则表示【职位月薪】中并无数字形式存在,可能是‘面议’、‘found no element’
                    self.data['bottom'].iloc[i] = self.data['top'].iloc[i] = self.data['average'].iloc[i] = item
                    q3 += 1

            except Exception as e:
                q4 += 1
                # print(q4, item, repr(e))

        # return self.data


    def cityDataRinse(self):
        self.data['jobsCity'] = self.data['jobsAddress']
        pattern2 = re.compile('(.*?)(\-)')
        dfCity = self.data['jobsAddress'].copy()

        for i in range(len(dfCity)):
            item = dfCity.iloc[i].strip()
            result = re.search(pattern2, item)
            if result:
                # print(result.group(1).strip())
                dfCity.iloc[i] = result.group(1).strip()
            else:
                # print(item.strip())
                dfCity.iloc[i] = item.strip()

        self.data['jobsCity'] = dfCity

        # return self.data

    def jobExp(self):
        data = self.data['jobsExp'].value_counts()
        # print(data)

    '''
        工作经验-平均工资
    '''
    def averageMonthly(self):
        dfAvrMonth = self.data['average'].replace(['面议','found no element'],numpy.nan)
        averMonthData = pandas.DataFrame(data={'jobsExp': self.data['jobsExp'], 'avrMonth': dfAvrMonth})

        pattern = re.compile('([0-9]+)')
        avrlist = []
        for i in range(len(self.data['average'])):
            item = self.data['average'].iloc[i].strip()
            result = re.findall(pattern, item)
            # print(float(item)*1000)
            try:
                if result:
                    avrlist.append(float(float(item)*1000))
                elif (item.strip() == 'found no element' or item.strip() == '面议'):
                    avrlist.append(numpy.nan)
                else:
                    print(item)
            except Exception as e:
                print(item, type(item), repr(e))
        averMonthData['avrMonth'] = avrlist
        self.data['avrMonth'] = averMonthData['avrMonth']

        avrMonthGroup = averMonthData['avrMonth'].groupby(averMonthData['jobsExp'])
        # data = avrMonthGroup.mean()
        # print(data)
        # pandas.Series(data = {'平均值':averMonthData['avrMonth'].mean()})
        result = avrMonthGroup.mean().append(pandas.Series(data = {'平均值':averMonthData['avrMonth'].mean()}))
        # data1 = result3.sort_values(ascending=False).round(1)
        return result

    '''
        城市-平均工资
    '''
    def averCityMonthly(self):
        cityAvrMonth = pandas.DataFrame(data={'jobsCity': self.data['jobsCity'], 'avrMonth': self.data['avrMonth']})

        cityAvrMonthGroup = cityAvrMonth['avrMonth'].groupby(cityAvrMonth['jobsCity'])

        # cityAvrMonthGroup.count().sum()

        result = cityAvrMonthGroup.mean().append(pandas.Series(data={'平均值': self.data['avrMonth'].mean()}))# 新增一个平均值,即所有非空df2['平均月薪']的平均值

        # print(result.iloc[-1])
        # result4.sort_values(ascending=False).round(1) # sort_values()方法可以对值进行排序,默认按照升序,round(1)表示小数点后保留1位小数。
        return result

    '''
        学历-职位数量分布
    '''
    def averEducationNum(self):
        self.data['degreeReq'].value_counts()
        minEducation = self.data['degreeReq'].replace(['其他'], numpy.nan)
        minEducation.value_counts()
        minEducation.value_counts().sum()
        # self.data['minDegreeReq'] = minEducation

        return minEducation

    '''
        学历-平均工资分布
    '''
    def averEducationMonthly(self):
        eduMonthly = pandas.DataFrame(data={'degreeReq': self.data['degreeReq'], 'avrMonth': self.data['avrMonth']})
        EduMonthlyGroup = eduMonthly['avrMonth'].groupby(eduMonthly['degreeReq'])

        # print(EduMonthlyGroup.mean())
        return EduMonthlyGroup

    '''
       学历-工作经验-平均月薪分布
    '''
    def averJobsExpAndEduForMonthly(self):
        jobsExpAndEduForMonthly = pandas.DataFrame(data={'avrMonth': self.data['avrMonth'], 'degreeReq': self.data['degreeReq'], 'jobsExp': self.data['jobsExp']})
        jobsExpAndEduForMonthlyGroup = jobsExpAndEduForMonthly['avrMonth'].groupby([jobsExpAndEduForMonthly['degreeReq'], jobsExpAndEduForMonthly['jobsExp']])

        return jobsExpAndEduForMonthlyGroup

    def getALLData(self):
        return self.data

数据可视化

可视化,是个什么概念,意思就是通过把数据通过某个结构做出来的特点数据,这一块叫做分析。通过图表的方式展现出来,这就是可视化。 那当然,Excel的VBA也能做。但是在细节上的处理我觉得python会更甚一筹。而且python同步进行的数据清洗相对来说比Excel减低了很多的工作量 分析数据清洗数据有pandas包,可视化当然也有,那就是matplotlib

首先我要设置一个画布空间,set_size_inches是个这个画布尺寸,单位是英寸。1英寸等于 2.54 cm。

canvas = plt.figure(1, facecolor='black')  # 设置视图画布1
canvas.set_size_inches(18.5, 10.5)

然后就是画图了,看注释

    ax1_1 = canvas.add_subplot(2, 1, 1, facecolor='#4f4f4f',alpha=0.3)  # 在视图1中设置子图1,背景色灰色,透明度0.3(figure.add_subplot 和plt.suplot都行)
    plt.tick_params(colors='white')  # 设置轴的颜色为白色
    allData['jobsCity'].value_counts().plot(kind='bar', rot=0, color='#ef9d9a')  # 画直方图图

    # 设置图标题,x和y轴标题
    plt.title('城市——python在招职位数分布图', fontsize=18, color='yellow')  # 设置标题
    plt.xlabel('城市', fontsize=14, color='yellow')  # 设置X轴轴标题
    plt.ylabel('职位数量', fontsize=14, color='yellow')  # 设置Y轴轴标题

    # 设置说明,位置在图的右上角
    ax1_1.text(13, 453, '城市总数:'+str(len(allData['jobsCity'].value_counts().values))+'(个)', fontsize=12, color='cyan')  # 设置说明,位置在图的右上角
    ax1_1.text(13, 403, '职位总数:'+str(allData.shape[0])+'(条)', fontsize=12, color='cyan')
    ax1_1.text(13, 353, '有效职位:'+str(allData.shape[0])+'(条)', fontsize=12, color='red')

    # 添加每一个城市的坐标值
    list_1 = allData['jobsCity'].value_counts().values
    for i in range(len(list_1)):
        ax1_1.text(i - 0.3, list_1[i], str(list_1[i]), color='yellow')

    plt.show()

然后这就是一张画好的图了 1618989734200-3qo.png

就是这么简单。所以我做了几个示例。

全部代码

1.main.py

import pandas
import re
import numpy
import drawVisual
import os

'''判断文件夹是否存在'''
def dirNotExistCreate(path):
    if not os.path.exists(path):
        os.mkdir(path)


class Clean:
    def __init__(self):
        self.data = pandas.read_json('newData.json')

        '''
        重組排序規則,使用自己的id作為索引
        '''
        self.data.index = self.data['id']
        del (self.data['id'])
        dfSort = self.data.sort_index()
        self.data = dfSort

    def wageDataRinse(self):
        self.data['bottom'] = self.data['top'] = self.data['average'] = self.data['positionWage']

        pattern = re.compile('([0-9]+)')
        q1 = q2 = q3 = q4 = 0

        for i in range(len(self.data['positionWage'])):
            item = self.data['positionWage'].iloc[i].strip()
            result = re.findall(pattern, item)
            try:
                if result:
                    try:
                        # 此语句执行成功则表示result[0],result[1]都存在,即职位月薪形如‘6000-8000元/月’
                        self.data['bottom'].iloc[i], self.data['top'].iloc[i] = result[0], result[1]
                        self.data['average'].iloc[i] = str((int(result[0]) + int(result[1])) / 2)
                        q1 += 1

                    except:
                        # 此语句执行成功则表示result[0]存在,result[1]不存在,职位月薪形如‘10000元/月以下’
                        self.data['bottom'].iloc[i] = df['top'].iloc[i] = result[0]
                        self.data['average'].iloc[i] = str((int(result[0]) + int(result[0])) / 2)
                        q2 += 1
                else:
                    # 此语句执行成功则表示【职位月薪】中并无数字形式存在,可能是‘面议’、‘found no element’
                    self.data['bottom'].iloc[i] = self.data['top'].iloc[i] = self.data['average'].iloc[i] = item
                    q3 += 1

            except Exception as e:
                q4 += 1
                # print(q4, item, repr(e))

        # return self.data


    def cityDataRinse(self):
        self.data['jobsCity'] = self.data['jobsAddress']
        pattern2 = re.compile('(.*?)(\-)')
        dfCity = self.data['jobsAddress'].copy()

        for i in range(len(dfCity)):
            item = dfCity.iloc[i].strip()
            result = re.search(pattern2, item)
            if result:
                # print(result.group(1).strip())
                dfCity.iloc[i] = result.group(1).strip()
            else:
                # print(item.strip())
                dfCity.iloc[i] = item.strip()

        self.data['jobsCity'] = dfCity

        # return self.data

    def jobExp(self):
        data = self.data['jobsExp'].value_counts()
        # print(data)

    '''
        平均工资
    '''
    def averageMonthly(self):
        dfAvrMonth = self.data['average'].replace(['面议','found no element'],numpy.nan)
        averMonthData = pandas.DataFrame(data={'jobsExp': self.data['jobsExp'], 'avrMonth': dfAvrMonth})

        pattern = re.compile('([0-9]+)')
        avrlist = []
        for i in range(len(self.data['average'])):
            item = self.data['average'].iloc[i].strip()
            result = re.findall(pattern, item)
            # print(float(item)*1000)
            try:
                if result:
                    avrlist.append(float(float(item)*1000))
                elif (item.strip() == 'found no element' or item.strip() == '面议'):
                    avrlist.append(numpy.nan)
                else:
                    print(item)
            except Exception as e:
                print(item, type(item), repr(e))
        averMonthData['avrMonth'] = avrlist
        self.data['avrMonth'] = averMonthData['avrMonth']

        avrMonthGroup = averMonthData['avrMonth'].groupby(averMonthData['jobsExp'])
        # data = avrMonthGroup.mean()
        # print(data)
        # pandas.Series(data = {'平均值':averMonthData['avrMonth'].mean()})
        result = avrMonthGroup.mean().append(pandas.Series(data = {'平均值':averMonthData['avrMonth'].mean()}))
        # data1 = result3.sort_values(ascending=False).round(1)
        return result

    '''
        城市-平均工资
    '''
    def averCityMonthly(self):
        cityAvrMonth = pandas.DataFrame(data={'jobsCity': self.data['jobsCity'], 'avrMonth': self.data['avrMonth']})

        cityAvrMonthGroup = cityAvrMonth['avrMonth'].groupby(cityAvrMonth['jobsCity'])

        # cityAvrMonthGroup.count().sum()

        result = cityAvrMonthGroup.mean().append(pandas.Series(data={'平均值': self.data['avrMonth'].mean()}))# 新增一个平均值,即所有非空df2['平均月薪']的平均值

        # print(result.iloc[-1])
        # result4.sort_values(ascending=False).round(1) # sort_values()方法可以对值进行排序,默认按照升序,round(1)表示小数点后保留1位小数。
        return result

    '''
        学历-职位数量分布
    '''
    def averEducationNum(self):
        self.data['degreeReq'].value_counts()
        minEducation = self.data['degreeReq'].replace(['其他'], numpy.nan)
        minEducation.value_counts()
        minEducation.value_counts().sum()
        # self.data['minDegreeReq'] = minEducation

        return minEducation

    '''
        学历-平均工资分布
    '''
    def averEducationMonthly(self):
        eduMonthly = pandas.DataFrame(data={'degreeReq': self.data['degreeReq'], 'avrMonth': self.data['avrMonth']})
        EduMonthlyGroup = eduMonthly['avrMonth'].groupby(eduMonthly['degreeReq'])

        # print(EduMonthlyGroup.mean())
        return EduMonthlyGroup

    def averJobsExpAndEduForMonthly(self):
        jobsExpAndEduForMonthly = pandas.DataFrame(data={'avrMonth': self.data['avrMonth'], 'degreeReq': self.data['degreeReq'], 'jobsExp': self.data['jobsExp']})
        jobsExpAndEduForMonthlyGroup = jobsExpAndEduForMonthly['avrMonth'].groupby([jobsExpAndEduForMonthly['degreeReq'], jobsExpAndEduForMonthly['jobsExp']])

        return jobsExpAndEduForMonthlyGroup

    def getALLData(self):
        return self.data

if __name__ == '__main__':
    data = pandas.read_json('newData.json')
    dirNotExistCreate('./canvas/')

    '''
        清洗数据
    '''
    clean = Clean()
    clean.wageDataRinse()
    clean.cityDataRinse()
    clean.jobExp()
    avrMonth = clean.averageMonthly()
    cityAvrMonth = clean.averCityMonthly()
    eduAvrNum = clean.averEducationNum()
    eduMonthly = clean.averEducationMonthly()
    jobsExpAndEduForMonthly = clean.averJobsExpAndEduForMonthly()

    resData = clean.getALLData()

    '''
        数据分析之可视化
    '''
    draw = drawVisual.Draw()
    draw.drawCityNum(resData,'cityNum.jpg')
    draw.drawJobsExp(resData,'jobsExp.jpg')
    draw.drawJobExpAndAvrMonth(avrMonth,resData,'jobExpavrMonth.jpg')
    draw.drawCityAndAvrMonth(cityAvrMonth,resData,'cityAvrMonth.jpg')
    draw.drawEduAndCount(eduAvrNum,resData,'eduCount.jpg')
    draw.drawEduMonthly(eduMonthly,resData,'eduAvrMonth.jpg')
    draw.drawJobsExpAndEduForMonthly(jobsExpAndEduForMonthly,resData,'jobExpEduMonth.jpg')

2.drawVisual.py

import matplotlib.pyplot as plt
import numpy
import matplotlib.ticker as mticker

class Draw:
    def __init__(self):
        plt.rcParams['font.sans-serif'] = ['Heiti TC']
        self.key = 'Python'
        self.dataSource = '拉勾网'

    '''
        城市职位数量
    '''
    def drawCityNum(self,allData,fileName):
        canvas = plt.figure(1, facecolor='black')  # 设置视图画布1
        canvas.set_size_inches(18.5, 10.5)

        ax1_1 = canvas.add_subplot(2, 1, 1, facecolor='#4f4f4f',alpha=0.3)  # 在视图1中设置子图1,背景色灰色,透明度0.3(figure.add_subplot 和plt.suplot都行)
        plt.tick_params(colors='white')  # 设置轴的颜色为白色
        allData['jobsCity'].value_counts().plot(kind='bar', rot=0, color='#ef9d9a')  # 画直方图图

        # 设置图标题,x和y轴标题
        plt.title(self.dataSource + self.key + '城市在招职位数分布图', fontsize=18, color='yellow')  # 设置标题
        plt.xlabel('城市', fontsize=14, color='yellow')  # 设置X轴轴标题
        plt.ylabel('职位数量', fontsize=14, color='yellow')  # 设置Y轴轴标题

        # 设置说明,位置在图的右上角
        ax1_1.text(13, 453, '城市总数:'+str(len(allData['jobsCity'].value_counts().values))+'(个)', fontsize=12, color='cyan')  # 设置说明,位置在图的右上角
        ax1_1.text(13, 403, '职位总数:'+str(allData.shape[0])+'(条)', fontsize=12, color='cyan')
        ax1_1.text(13, 353, '有效职位:'+str(allData.shape[0])+'(条)', fontsize=12, color='red')

        # 添加每一个城市的坐标值
        list_1 = allData['jobsCity'].value_counts().values
        for i in range(len(list_1)):
            ax1_1.text(i - 0.3, list_1[i], str(list_1[i]), color='yellow')

        # 可以用plt.grid(True)添加栅格线
        # 可以用下面语句添加注释箭头。指向上海,xy为坐标值、xytext为注释坐标值,facecolor为箭头颜色。
        # arrow = plt.annotate('职位数:xxx', xy=(1,xxx), xytext=(3, xxx),color='blue',arrowprops=dict(facecolor='blue', shrink=0.05))
        ax1_2 = canvas.add_subplot(2, 1, 2)  # 设置子图2,是位于子图1下面的饼状图

        x = allData['jobsCity'].value_counts().values  # x是数值列表,pie图的比例根据数值占整体的比例而划分
        label_list = []  # label_list是构造的列表,装的是前8个城市的名称+职位占比。
        for i in range(15):
            t = allData['jobsCity'].value_counts().values[i] / allData['jobsCity'].value_counts().sum() * 100
            city = allData['jobsCity'].value_counts().index[i]
            percent = str('%.1f%%' % t)
            label_list.append(city + percent)

        # labels参数原本是与数值对应的标签列表,此处30个城市过多,所以只取了前8个城市显示。
        # explode即饼图中分裂的效果explode=(0.1,1,1,。。)表示第一块图片显示为分裂效果
        labels = label_list
        explode = tuple([0.1] + [0] * 14)
        # x = [471, 305, 278, 189, 107, 98, 71, 36, 30, 29, 22, 11, 11, 8, 1]
        # print(len(labels),len(x),len(explode))
        plt.pie(x, explode=explode, labels=labels, textprops={'color': 'yellow'})

        if not fileName:
            plt.show()
        else:
            plt.savefig('./canvas/'+fileName,dpi=100)

    '''工作经验——职位数分布图'''
    def drawJobsExp(self,data,fileName):
        canvas = plt.figure(2, facecolor='black')
        canvas.set_size_inches(18.5,10.5)


        '''图表1-1-1'''
        ax1_1 = canvas.add_subplot(2, 1, 1, facecolor='#4f4f4f', alpha=0.3)
        data['jobsExp'].value_counts().plot(kind='bar', rot=0, color='#7fc8ff')
        plt.tick_params(colors='white')
        plt.title(self.dataSource + self.key + '工作经验——职位数分布图', fontsize=18, color='yellow')#标题
        plt.xlabel('工作经验', fontsize=14, color='yellow')#x轴
        plt.ylabel('职位数量', fontsize=14, color='yellow')#y轴
        plt.grid(True)
        ax1_1.text(3, 880, '城市总数:'+ str(len(data['jobsCity'].value_counts().values))+'(个)', fontsize=12, color='yellow')
        ax1_1.text(3, 800, '职位总数:'+str(data.shape[0])+'(条)', fontsize=12, color='yellow')
        ax1_1.text(3, 720, '有效职位:'+str(data.shape[0])+'(条)', fontsize=12, color='cyan')

        '''图表1-1-2'''
        ax1_2 = canvas.add_subplot(2, 1, 2)

        # x是数值列表,pie图的比例根据数值占整体的比例而划分
        x2 = data['jobsExp'].value_counts().values
        labels = list(data['jobsExp'].value_counts().index[:len(x2)])
        explode = tuple([0.1] * len(x2))
        plt.pie(x2, explode=explode, labels=labels, autopct='%1.1f%%', textprops={'color': 'yellow'})
        plt.axis('equal')  # 显示为等比例圆形
        ax1_2.legend(loc='lower right', shadow=True, fontsize=12, edgecolor='cyan')# 设置图例,方位为右下角

        if not fileName:
            plt.show()
        else:
            plt.savefig('./canvas/'+fileName,dpi=100)

    '''工作经验——平均月薪分布图'''
    def drawJobExpAndAvrMonth(self,data,allData,fileName):
        avrData = data.iloc[-1]
        plt.style.use('ggplot')
        canvas = plt.figure(3, facecolor='black')
        canvas.set_size_inches(18.5, 10.5)

        ax3 = canvas.add_subplot(1, 1, 1, facecolor='#4f4f4f', alpha=0.3)
        data.sort_values(ascending=False).round(1).plot(kind='barh', rot=0)

        # 设置标题、x轴、y轴的标签文本
        plt.title(self.dataSource + self.key + '工作经验——平均月薪分布图', fontsize=18, color='yellow')
        plt.xlabel('平均月薪', fontsize=14, color='yellow')
        plt.ylabel('工作经验', fontsize=14, color='yellow')

        # 添加值标签
        list3 = data.sort_values(ascending=False).values
        for i in range(len(list3)):
            ax3.text(list3[i], i, str(int(list3[i])), color='yellow')

        # 设置标识箭头
        plt.annotate('平均月薪:'+str(int(avrData))+'元/月', xy=(14197, 2.25), xytext=(20000, 3.05), color='yellow', fontsize=16, arrowprops=dict(facecolor='cyan', shrink=0.05))
        ax3.text(30250, 4.25, '月薪样本数:'+str(allData.shape[0])+'(个)', fontsize=16, color='cyan')# 设置图例注释(alldata['xxx'].value_counts().sum())

        # 设置轴刻度文字颜色为白色
        plt.tick_params(colors='white')

        if not fileName:
            plt.show()
        else:
            plt.savefig('./canvas/'+fileName,dpi=100)

    '''
        城市-月均工资
    '''
    def drawCityAndAvrMonth(self,data,allData,fileName):
        avrData = data.iloc[-1]
        plt.style.use('dark_background')
        canvas = plt.figure(4)
        canvas.set_size_inches(18.5, 10.5)

        ax = canvas.add_subplot(1, 1, 1)  # 可选facecolor='#4f4f4f',alpha=0.3,设置子图,背景色灰色,透明度0.3
        data.sort_values(ascending=False).round(1).plot(kind='bar', rot=30)  # 可选color='#ef9d9a'

        plt.title(self.dataSource + self.key + u'城市——平均月薪分布图', fontsize=18, color='yellow')  # 设置标题
        plt.xlabel(u'城市', fontsize=14, color='yellow')  # 设置X轴轴标题
        plt.ylabel(u'平均月薪', fontsize=14, color='yellow')  # 设置Y轴轴标题

        ax.text(13.5, 18800, u'城市总数:'+str(len(allData['jobsCity'].value_counts().values))+'(个)', fontsize=12, color='#FF00FF')  # 设置说明,位置在图的右上角
        ax.text(13.5, 18200, u'月薪样本数:'+str(allData.shape[0])+'(条)', fontsize=12, color='#FF00FF')


        listData = data.sort_values(ascending=False).values # 添加每一个城市的坐标值
        for i in range(len(listData)):
            ax.text(i-0.25, listData[i], int(listData[i]), color='yellow')

        plt.annotate(u'全国月薪平均值:'+str(int(avrData))+'元/月', xy=(4.3, 14197), xytext=(4.8, 18000), color='#9B30FF', fontsize=14,arrowprops=dict(facecolor='#FF00FF', shrink=0.05))# 设置箭头注释

        plt.tick_params(colors='pink')# 设置轴刻度文字颜色为粉色

        if not fileName:
            plt.show()
        else:
            plt.savefig('./canvas/'+fileName,dpi=100)

    def drawEduAndCount(self,data,allData,fileName):
        plt.style.use('dark_background')
        canvas = plt.figure(5)
        canvas.set_size_inches(18.5, 10.5)

        ax1_1 = canvas.add_subplot(2, 1, 1)  # 可选facecolor='#4f4f4f',alpha=0.3
        data.value_counts().plot(kind='bar', rot=0)  # color='#7fc8ff'

        # 设置标题、x轴和y轴标题、图例文字
        plt.title(self.dataSource + self.key + u'要求学历——职位数分布图', fontsize=18, color='yellow')
        plt.xlabel(u'要求学历', fontsize=14, color='yellow')
        plt.ylabel(u'职位数量', fontsize=14, color='yellow')
        ax1_1.text(2, 1200, u'职位总数:'+ str(allData.shape[0]) +'(条)', fontsize=14, color='#B452CD')

        # 设置坐标轴的的颜色和文字大小
        plt.tick_params(colors='#9F79EE', labelsize=13)

        # 设置坐标值文字
        list5 = data.value_counts().values
        for i in range(len(list5)):
            ax1_1.text(i - 0.1, list5[i], int(list5[i]), color='yellow')

        ax1_2 = canvas.add_subplot(2, 1, 2)
        xl = data.value_counts().values

        labels = list(data.value_counts().index)
        explode = tuple([0.1] * len(xl))
        plt.pie(xl, explode=explode, labels=labels, autopct='%1.1f%%', textprops={'color': '#B452CD'})
        plt.axis('equal')

        ax1_2.legend(loc='lower right', shadow=True, fontsize=12, edgecolor='#B452CD')
        plt.tick_params(colors='#9F79EE', labelsize=13)

        if not fileName:
            plt.show()
        else:
            plt.savefig('./canvas/'+fileName,dpi=100)

    def drawEduMonthly(self,data,allData,fileName):
        plt.style.use('ggplot')
        canvas = plt.figure(6, facecolor='black')
        canvas.set_size_inches(18.5, 10.5)

        ax = canvas.add_subplot(1, 1, 1, facecolor='#4f4f4f', alpha=0.3)
        data.mean().round(1).sort_values().plot(color='cyan')  # 在条形图上叠加一个折线图
        data.mean().round(1).sort_values().plot(kind='bar', rot=0)

        # 设置标题、x轴、y轴的标签文本
        plt.title(self.dataSource + self.key + u'要求学历——平均月薪分布图', fontsize=18, color='yellow')
        plt.xlabel(u'要求学历', fontsize=14, color='yellow')
        plt.ylabel(u'平均月薪', fontsize=14, color='yellow')

        # 添加值标签(坐标值文字)
        listData = data.mean().round(1).sort_values().values
        for i in range(len(listData)):
            ax.text(i - 0.1, listData[i], int(listData[i]), color='yellow')

        # 设置图例注释
        ax.text(2, 17500, u'月薪样本数:'+str(allData.shape[0])+'(个)', fontsize=16, color='cyan')

        # 设置轴刻度的文字颜色
        plt.tick_params(colors='#9F79EE')

        if not fileName:
            plt.show()
        else:
            plt.savefig('./canvas/'+fileName,dpi=100)

    def drawJobsExpAndEduForMonthly(self,data,allData,fileName):
        xlist = list(data.mean().round(1)[:, '3-5年'].sort_values().index)
        print(xlist)
        data.mean().round(1)[:, '1-3年'].reindex(xlist)

        plt.style.use('dark_background')
        canvas = plt.figure(7, facecolor='black')
        canvas.set_size_inches(18.5, 10.5)

        ax = canvas.add_subplot(1, 1, 1, facecolor='#4f4f4f', alpha=0.3)
        title = plt.title(self.dataSource + self.key +u'要求学历-工作经验-平均月薪分布图', fontsize=18, color='yellow')
        xlabel = plt.xlabel(u'要求学历', fontsize=14, color='yellow')
        ylabel = plt.ylabel(u'平均月薪', fontsize=14, color='yellow')
        plt.tick_params(colors='cyan')

        label_format = '{:,.0f}'

        x2 = allData['jobsExp'].value_counts().keys()
        # print(x2)
        j = 1
        keysList = []
        for i in x2:
            locals()['list'+str(j)] = data.mean().round(1)[:, i].reindex(xlist).values
            j = j+1
            keysList.append(i)

        ind = numpy.arange(len(xlist))  # ind为x轴宽度,用numpy的array形式表示
        width = 0.1  # 条形图的宽度,要合理设置否则太宽会摆不下
        e = 1
        imgList = []
        for f in x2:
            # print(locals()['list'+str(e)])
            if e == 1:
                locals()['img' + str(e)] = ax.bar(ind, locals()['list'+str(e)], width)
            else:
                locals()['img' + str(e)] = ax.bar(ind + width * (e -1), locals()['list' + str(e)], width)
            imgList.append(locals()['img' + str(e)][0])
            e = e+1

        # # 设置X轴文本和位置调整

        # ax.xaxis.set_major_locator(mticker.FixedLocator([0,1,2]))

        ax.set_xticklabels(xlist)
        # ax.set_xticklabels([label_format.format(x) for x in ticks_loc])
        ax.set_xticks(ind + width / 2)
        #
        # # 设置文字说明
        text1 = ax.text(0.1, 34100, u'数据来源:' + self.dataSource, fontsize=13, color='#9F79EE')
        text2 = ax.text(0.1, 33200, u'职位关键词:' + self.key, fontsize=13, color='#9F79EE')
        text3 = ax.text(0.1, 32200, u'工作城市:全国'+str(len(allData['jobsCity'].value_counts().values))+'座城市', fontsize=13, color='#9F79EE')
        text4 = ax.text(0.1, 31200, u'职位数量:共计'+str(allData.shape[0])+'(条)', fontsize=13, color='#9F79EE')

        ax.legend(imgList,keysList,fontsize=13,facecolor='black',loc='upper left')
        #
        # # 设置栅格
        # plt.grid(True)
        #
        if not fileName:
            plt.show()
        else:
            plt.savefig('./canvas/'+fileName,dpi=100)

输出结果

cityNum.jpg jobsExp.jpg jobExpavrMonth.jpg cityAvrMonth.jpg eduCount.jpg eduAvrMonth.jpg jobExpEduMonth.jpg

----------end

本文为ctexthuang原创文章,转载请注明来自ctexthuang_blog

tag(s): none
show comments · back · home
Edit with Markdown
召唤看板娘