{ "cells": [ { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "# データを統計量で記述する" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 平均を求める" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "....\n", "----------------------------------------------------------------------\n", "Ran 4 tests in 0.003s\n", "\n", "OK\n" ] } ], "source": [ "import unittest\n", "\n", "'''\n", "平均を計算\n", "'''\n", "def calculate_mean(numbers): \n", " s = sum(numbers)\n", " N = len(numbers)\n", " mean = s / N\n", "\n", " return mean\n", "\n", "\n", "class TestCalculateMean(unittest.TestCase):\n", " \n", " def test_01(self):\n", " donations = [100, 60, 70, 900, 100, 200, 500, 500, 503, 600, 1000, 1200]\n", " mean = calculate_mean(donations)\n", " N = len(donations)\n", " self.assertEqual(mean, 477.75)\n", " self.assertEqual(N, 12)\n", "\n", "\n", "if __name__ == '__main__':\n", " unittest.main(argv=['first-arg-is-ignored'], exit=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 中央値を求める" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ".......\n", "----------------------------------------------------------------------\n", "Ran 7 tests in 0.006s\n", "\n", "OK\n" ] } ], "source": [ "import unittest\n", "\n", "'''\b\n", "中央値を計算\n", "'''\n", "def calculate_median(numbers):\n", " N = len(numbers)\n", " numbers.sort()\n", "\n", " if N % 2 == 0:\n", " m1 = N / 2\n", " m2 = (N / 2) + 1\n", " m1 = int(m1) - 1\n", " m2 = int(m2) - 1\n", " median = (numbers[m1] + numbers[m2]) / 2\n", " else:\n", " m = (N + 1) / 2\n", " m = int(m) - 1\n", " median = numbers[m]\n", "\n", " return median\n", "\n", "\n", "class TestListCalculateMedian(unittest.TestCase):\n", " def test_01(self):\n", " donations = [100, 60, 70, 900, 100, 200, 500, 500, 503, 600, 1000, 1200]\n", " median = 
calculate_median(donations)\n", " N = len(donations)\n", " self.assertEqual(median, 500)\n", " \n", " def test_02(self):\n", " donations = [60, 70, 100, 900]\n", " median = calculate_median(donations)\n", " N = len(donations)\n", " self.assertEqual(median, 85)\n", " \n", " def test_03(self):\n", " donations = [60, 70, 100]\n", " median = calculate_median(donations)\n", " N = len(donations)\n", " self.assertEqual(median, 70)\n", "\n", "\n", "if __name__ == '__main__':\n", " unittest.main(argv=['first-arg-is-ignored'], exit=False)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 最頻値を求め度数分布表を作る\n", "### 一番多い要素を見つける" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(4, 2), (2, 1), (1, 1), (3, 1)]\n", "[(4, 2)]\n", "[(4, 2), (2, 1)]\n", "[(4, 2)]\n", "4\n" ] } ], "source": [ "from collections import Counter\n", "\n", "simplelist = [4, 2, 1, 3, 4]\n", "c = Counter(simplelist)\n", "print(c.most_common())\n", "print(c.most_common(1))\n", "print(c.most_common(2))\n", "\n", "mode = c.most_common(1)\n", "print(mode)\n", "print(mode[0][0])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 最頻値を探す" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "........\n", "----------------------------------------------------------------------\n", "Ran 8 tests in 0.008s\n", "\n", "OK\n" ] } ], "source": [ "import unittest\n", "\n", "'''\n", "最頻値を計算\n", "'''\n", "def calculate_mode(numbers):\n", " c = Counter(numbers)\n", " mode = c.most_common(1)\n", " return mode[0][0]\n", "\n", "\n", "class TestListCalculateMode(unittest.TestCase):\n", " def test_01(self):\n", " scores = [7, 8, 9, 2, 10, 9, 9, 9, 9, 4, 5, 6, 15, 6, 7, 8, 6, 1, 10]\n", " mode = calculate_mode(scores)\n", " self.assertEqual(mode, 9)\n", "\n", "\n", "if __name__ == '__main__':\n", " 
unittest.main(argv=['first-arg-is-ignored'], exit=False)\n" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ".........\n", "----------------------------------------------------------------------\n", "Ran 9 tests in 0.005s\n", "\n", "OK\n" ] } ], "source": [ "import unittest\n", "\n", "'''\n", "数のリストに複数の最頻値があるときに最頻値を計算\n", "'''\n", "def calculate_modes(numbers):\n", " c = Counter(numbers)\n", " numbers_freq = c.most_common()\n", " max_count = numbers_freq[0][1]\n", "\n", " modes = []\n", " for num in numbers_freq:\n", " if num[1] == max_count:\n", " modes.append(num[0])\n", " return modes\n", "\n", "\n", "class TestListCalculateModes(unittest.TestCase):\n", " def test_01(self):\n", " scores = [5, 5, 5, 4, 4, 4, 9, 1, 3]\n", " modes = calculate_modes(scores)\n", " self.assertEqual(modes, [5, 4])\n", "\n", "\n", "if __name__ == '__main__':\n", " unittest.main(argv=['first-arg-is-ignored'], exit=False)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 度数分布を作る\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "| 点数 | 頻度 |\n", "| - | - |\n", "| 1 | 2 |\n", "|2|1|\n", "|4|1|\n", "|5|2|\n", "|6|3|\n", "|7|2|\n", "|8|2|\n", "|9|5|\n", "|10|2|\n" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number\tFrequency\n", "9\t5\n", "6\t3\n", "7\t2\n", "8\t2\n", "10\t2\n", "5\t2\n", "1\t2\n", "2\t1\n", "4\t1\n" ] } ], "source": [ "\"\"\"\n", "数のリストの度数分布表\n", "\"\"\"\n", "def frequency_table(numbers):\n", " table = Counter(numbers)\n", " print('Number\\tFrequency')\n", " \n", " for number in table.most_common():\n", " print('{0}\\t{1}'.format(number[0], number[1]))\n", "\n", "\n", "if __name__ == '__main__':\n", " scores = [7, 8, 9, 2, 10, 9, 9, 9, 9, 4, 5, 6, 1, 5, 6, 7, 8, 6, 1, 10]\n", " frequency_table(scores)\n" ] }, { "cell_type": "code", "execution_count": 26, "metadata": 
{}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number\tFrequency\n", "1\t2\n", "2\t1\n", "4\t1\n", "5\t2\n", "6\t3\n", "7\t2\n", "8\t2\n", "9\t5\n", "10\t2\n" ] } ], "source": [ "\"\"\"\n", "数のリストの度数分布表\n", "数の順に表示するよう修正\n", "\"\"\"\n", "def frequency_sorted_table(numbers):\n", " table = Counter(numbers)\n", " numbers_freq = table.most_common()\n", " numbers_freq.sort()\n", " \n", " print('Number\\tFrequency')\n", " for number in numbers_freq:\n", " print('{0}\\t{1}'.format(number[0], number[1]))\n", " \n", "\n", "if __name__ == '__main__':\n", " scores = [7, 8, 9, 2, 10, 9, 9, 9, 9, 4, 5, 6, 1, 5, 6, 7, 8, 6, 1, 10]\n", " frequency_sorted_table(scores)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 散らばりを測る\n", "### 数集合の範囲を求める" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "..........\n", "----------------------------------------------------------------------\n", "Ran 10 tests in 0.007s\n", "\n", "OK\n" ] } ], "source": [ "import unittest\n", "\n", "'''\n", "範囲を決める\n", "'''\n", "def find_range(numbers):\n", " lowest = min(numbers)\n", " highest = max(numbers)\n", " f = highest - lowest\n", "\n", " return lowest, highest, f\n", "\n", "\n", "class TestFindRange(unittest.TestCase):\n", " def test_01(self):\n", " donations = [100, 60, 70, 900, 100, 200, 500, 500, 503, 600, 1000, 1200]\n", " lowest, highest, r = find_range(donations)\n", " self.assertEqual(lowest, 60)\n", " self.assertEqual(highest, 1200)\n", " self.assertEqual(r, 1140)\n", "\n", "\n", "if __name__ == '__main__':\n", " unittest.main(argv=['first-arg-is-ignored'], exit=False)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 分散と標準偏差を求める" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "$\\frac{\\sum(x_i - x_{avg})^2}{n}$" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": 
"stream", "text": [ "..........\n", "----------------------------------------------------------------------\n", "Ran 10 tests in 0.007s\n", "\n", "OK\n" ] } ], "source": [ "import unittest\n", "\n", "'''\n", "数のリストの分散と標準偏差を求める\n", "'''\n", "def calculate_variance(numbers):\n", " s = sum(numbers)\n", " N = len(numbers)\n", " mean = s / N\n", "\n", " diff = []\n", " for num in numbers:\n", " diff.append(num - mean)\n", "\n", " squared_diff = []\n", " for d in diff:\n", " squared_diff.append(d ** 2)\n", " sum_squared_diff = sum(squared_diff)\n", " variance = sum_squared_diff / len(numbers)\n", " return variance\n", "\n", "\n", "class TestCalculateVariance(unittest.TestCase):\n", " def test_01(self):\n", " donations = [100, 60, 70, 900, 100, 200, 500, 500, 503, 600, 1000, 1200]\n", " variance = calculate_variance(donations)\n", " self.assertEqual(variance, 141047.35416666666)\n", " std = variance ** 0.5\n", " self.assertEqual(std, 375.5627166887931)\n", "\n", "\n", "if __name__ == '__main__':\n", " unittest.main(argv=['first-arg-is-ignored'], exit=False)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2つのデータセットの相関を計算する\n", "### 相関係数を計算する" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "$\\frac{n\\sum xy - \\sum x\\sum y}{\\sqrt{(n \\sum x^2 -(\\sum x)^2)(n\\sum y^2 - (\\sum y)^2)}}$" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "$\\sum xy$ 2つの数集合$x$と$y$の個別要素の積和 \n", "\n", "$\\sum x$ 　集合$x$の数の和 \n", "\n", "$\\sum y$ 　集合$y$の数の和 \n", "\n", "$(\\sum x)^2$ 集合$x$の数の和の2乗 \n", "\n", "$(\\sum y)^2$ 集合$y$の数の和の2乗 \n", "\n", "$\\sum x^2$ 　集合$x$の数の2乗の和 \n", "\n", "$\\sum y^2$ 　集合$y$の数の2乗の和" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "..........\n", "----------------------------------------------------------------------\n", "Ran 10 tests in 0.006s\n", "\n", "OK\n" ] } ], "source": [ "import unittest\n", "\n", "'''\n", "相関係数を計算するプログラム\n", "'''\n", 
"def find_corr_x_y(x, y):\n", " n = len(x)\n", " # 積の和を求める\n", " prod = []\n", " for xi, yi in zip(x, y):\n", " prod.append(xi * yi)\n", " sum_prod_x_y = sum(prod)\n", " sum_x = sum(x)\n", " sum_y = sum(y)\n", " squared_sum_x = sum_x ** 2\n", " squared_sum_y = sum_y ** 2\n", " x_square = []\n", " for xi in x:\n", " x_square.append(xi ** 2)\n", " # 和を求める\n", " x_square_sum = sum(x_square)\n", " y_square = []\n", " for yi in y:\n", " y_square.append(yi ** 2)\n", " # 和を求める \n", " y_square_sum = sum(y_square)\n", " # 式を使って相関を計算\n", " numerator = n * sum_prod_x_y - sum_x * sum_y\n", " denominatior_term1 = n * x_square_sum - squared_sum_x\n", " denominatior_term2 = n * y_square_sum - squared_sum_y\n", " denominator = (denominatior_term1*denominatior_term2)**0.5\n", " correlation = numerator/denominator\n", " \n", " return correlation\n", "\n", "\n", "class TestFindCorr(unittest.TestCase):\n", " def test_01(self):\n", " x = [1, 2, 3]\n", " y = [1, 2, 3]\n", " corr = find_corr_x_y(x, y)\n", " self.assertEqual(corr, 1)\n", "\n", " x = [1, 2, 3]\n", " y = [-1, -2, -3]\n", " corr = find_corr_x_y(x, y)\n", " self.assertEqual(corr, -1)\n", "\n", " x = [1, 2, 3]\n", " y = [1, -2, 3]\n", " corr = find_corr_x_y(x, y)\n", " self.assertEqual(corr, 0.39735970711951313)\n", "\n", "\n", "if __name__ == '__main__':\n", " unittest.main(argv=['first-arg-is-ignored'], exit=False)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 散布図" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: seaborn in /Users/k2works/.pyenv/versions/3.7.0/lib/python3.7/site-packages (0.9.0)\r\n", "Requirement already satisfied: pandas>=0.15.2 in /Users/k2works/.pyenv/versions/3.7.0/lib/python3.7/site-packages (from seaborn) (0.23.4)\r\n", "Requirement already satisfied: scipy>=0.14.0 in /Users/k2works/.pyenv/versions/3.7.0/lib/python3.7/site-packages (from seaborn) (1.1.0)\r\n", 
"Requirement already satisfied: numpy>=1.9.3 in /Users/k2works/.pyenv/versions/3.7.0/lib/python3.7/site-packages (from seaborn) (1.15.1)\r\n", "Requirement already satisfied: matplotlib>=1.4.3 in /Users/k2works/.pyenv/versions/3.7.0/lib/python3.7/site-packages (from seaborn) (3.0.0)\r\n", "Requirement already satisfied: pytz>=2011k in /Users/k2works/.local/lib/python3.7/site-packages (from pandas>=0.15.2->seaborn) (2018.5)\r\n", "Requirement already satisfied: python-dateutil>=2.5.0 in /Users/k2works/.local/lib/python3.7/site-packages (from pandas>=0.15.2->seaborn) (2.7.3)\r\n", "Requirement already satisfied: cycler>=0.10 in /Users/k2works/.pyenv/versions/3.7.0/lib/python3.7/site-packages (from matplotlib>=1.4.3->seaborn) (0.10.0)\r\n", "Requirement already satisfied: kiwisolver>=1.0.1 in /Users/k2works/.pyenv/versions/3.7.0/lib/python3.7/site-packages (from matplotlib>=1.4.3->seaborn) (1.0.1)\r\n", "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /Users/k2works/.pyenv/versions/3.7.0/lib/python3.7/site-packages (from matplotlib>=1.4.3->seaborn) (2.2.1)\r\n", "Requirement already satisfied: six>=1.5 in /Users/k2works/.local/lib/python3.7/site-packages (from python-dateutil>=2.5.0->pandas>=0.15.2->seaborn) (1.11.0)\r\n", "Requirement already satisfied: setuptools in /Users/k2works/.pyenv/versions/3.7.0/lib/python3.7/site-packages (from kiwisolver>=1.0.1->matplotlib>=1.4.3->seaborn) (39.0.1)\r\n" ] } ], "source": [ "import sys\n", "!{sys.executable} -m pip install seaborn" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAW4AAAD8CAYAAABXe05zAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvqOYd8AAADkBJREFUeJzt3X9s3Pddx/HXC9uj167MaDlG47R4f1kaHavLqSoUTaOleLApjUb/CFKBDqFIILENkCfMH6Dxz/4wQhsgMVkZ0LGtW1dcq0TrvEitNE1imS5xtvSXURntVqeQ65DbdZyG4735w+eQuOfc98h97+7tPB+S1fP3vjq/P/k2z1y+972cI0IAgDx+ZNADAAC6Q7gBIBnCDQDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACQzWsaD7tu3LyYnJ8t4aADYk06ePPlyRFSL7FtKuCcnJ1Wv18t4aADYk2y/UHRfTpUAQDKEGwCSIdwAkAzhBoBkCDcAJFMo3Lb/wPZTtp+0/aDta8oeDADQXsfLAW1PSPqApLdFRNP2Q5IOS/qHkmcDgKG3tLKm+eVVnV1vav94RbMzUzo0PVHqzyx6HfeopIrtDUnXSjpb3kgAkMPSyprmFs+oubEpSVpbb2pu8YwklRrvjqdKImJN0l9I+raklyS9EhFfLm0iAEhifnn1QrS3NTc2Nb+8WurP7Rhu2z8u6R5Jb5W0X9J1tu9rs98R23Xb9Uaj0ftJAWDInF1vdrW9V4q8OPlLkv49IhoRsSFpUdLP79wpIhYiohYRtWq10NvtASC1/eOVrrb3SpFwf1vS7bavtW1Jd0l6ptSpACCB2ZkpVcZGLtlWGRvR7MxUqT+344uTEXHC9sOSTkk6L2lF0kKpUwFAAtsvQPb7qhJHRM8ftFarBf86IAAUZ/tkRNSK7Ms7JwEgGcINAMkQbgBIhnADQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACTTMdy2p2yfvujrVdsf6sdwAIDXG+20Q0SsSrpFkmyPSFqT9EjJcwEAdtHtqZK7JP1bRLxQxjAAgM66DfdhSQ+2u8P2Edt12/VGo3HlkwEA2iocbttvkHRQ0hfa3R8RCxFRi4hatVrt1XwAgB26ecb9K5JORcR/ljUMAKCzbsL969rlNAkAoH8Khdv2dZLulrRY7jgAgE46Xg4oSRHxfUlvLnkWAEABvHMSAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACRDuAEgGcINAMkQbgBIhnADQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMqNFdrI9LumopJslhaTfjoh/KXMwAK+3tLKm+eVVnV1vav94RbMzUzo0PTHosdBnhcIt6eOSvhQR99p+g6RrS5wJQBtLK2uaWzyj5samJGltvam5xTOSRLyvMh1Pldh+k6R3SvqkJEXE/0TEetmDAbjU/PLqhWhva25san55dUATYVCKnON+q6SGpL+3vWL7qO3rdu5k+4jtuu16o9Ho+aDA1e7serOr7di7ioR7VNKtkv42IqYlfV/SH+/cKSIWIqIWEbVqtdrjMQHsH690tR17V5FwvyjpxYg40fr+YW2FHEAfzc5MqTI2csm2ytiIZmemBjQRBqVjuCPiPyR9x/b2/x13SXq61KkAvM6h6Ql99H1v18R4RZY0MV7RR9/3dl6YvAoVvark9yV9pnVFybckvb+8kQDs5tD0BKFGsXBHxGlJtZJnAQAUwDsnASAZwg0AyRBuAEiGcANAMoQbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACRDuAEgGcINAMk
QbgBIhnADQDKEGwCSIdwAkAzhBoBkRovsZPt5Sd+TtCnpfETUyhwKALC7QuFu+cWIeLm0SQAAhXCqBACSKRrukPRl2ydtHylzIADA5RU9VfILEbFm+yckHbf9bER85eIdWkE/Ikk33XRTj8cEAGwr9Iw7ItZa/z0n6RFJt7XZZyEiahFRq1arvZ0SAHBBx3Dbvs729du3Jf2ypCfLHgwA0F6RUyVvkfSI7e39PxsRXyp1KgDArjqGOyK+JekdfZgFAFAAlwMCQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACRDuAEgGcINAMkQbgBIpnC4bY/YXrF9rMyBAACXN9rFvh+U9IykHytpFgyhpZU1zS+v6ux6U/vHK5qdmdKh6YlBjwVc1Qo947Z9QNJ7JB0tdxwMk6WVNc0tntHaelMhaW29qbnFM1paWRv0aMBVreipko9J+rCkH5Y4C4bM/PKqmhubl2xrbmxqfnl1QBMBkAqE2/Z7JZ2LiJMd9jtiu2673mg0ejYgBufserOr7QD6o8gz7jskHbT9vKTPSbrT9qd37hQRCxFRi4hatVrt8ZgYhP3jla62A+iPjuGOiLmIOBARk5IOS3o8Iu4rfTIM3OzMlCpjI5dsq4yNaHZmakATAZC6u6oEV5ntq0e4qgQYLo6Inj9orVaLer3e88cFgL3K9smIqBXZl3dOAkAyhBsAkiHcAJAM4QaAZAg3ACRDuAEgGcINAMkQbgBIhnADQDKEGwCSIdwAkAzhBoBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASKZjuG1fY/vrtr9h+ynbH+nHYACA9kYL7PMDSXdGxGu2xyR91fZjEfG1kmcDALTRMdwREZJea3071vqKMocCAOyu0Dlu2yO2T0s6J+l4RJwodywAwG4KhTsiNiPiFkkHJN1m++ad+9g+Yrtuu95oNHo9JwCgpaurSiJiXdITkt7d5r6FiKhFRK1arfZqPgDADkWuKqnaHm/drki6W9KzZQ8GAGivyFUlN0h6wPaItkL/UEQcK3csAMBuilxV8k1J032YBQBQAO+cBIBkCDcAJEO4ASAZwg0AyRBuAEiGcANAMoQbAJIh3ACQDOEGgGQINwAkQ7gBIBnCDQDJEG4ASIZwA0AyhBsAkiHcAJAM4QaAZAg3ACRDuAEgGcINAMkQbgBIhnADQDKEGwCSIdwAkMxopx1s3yjpU5LeIikkLUTEx8sYZmllTfPLqzq73tT+8YpmZ6Z0aHqijB8FAGl1DLek85L+KCJO2b5e0knbxyPi6V4OsrSyprnFM2pubEqS1tabmls8I0nEGwAu0vFUSUS8FBGnWre/J+kZST0v6fzy6oVob2tubGp+ebXXPwoAUuvqHLftSUnTkk60ue+I7brteqPR6HqQs+vNrrYDwNWqcLhtv1HSP0n6UES8uvP+iFiIiFpE1KrVateD7B+vdLUdAK5WhcJte0xb0f5MRCyWMcjszJQqYyOXbKuMjWh2ZqqMHwcAaRW5qsSSPinpmYj4y7IG2X4BkqtKAODyilxVcoek35B0xvbp1rY/iYgv9nqYQ9MThBoAOugY7oj4qiT3YRYAQAG8cxIAkiHcAJAM4QaAZAg3ACRDuAEgGUdE7x/Ubkh64QoeYp+kl3s0ziDtlXVIe2ctrGP47JW1XOk6fioiCr3tvJRwXynb9YioDXqOK7VX1iHtnbWwjuGzV9bSz3VwqgQAkiHcAJDMsIZ7YdAD9MheWYe0d9bCOobPXllL39YxlOe4AQC7G9Zn3ACAXQws3Lb/zvY520/ucr9t/5Xt52x/0/at/Z6xiALreJftV2yfbn39ab9nLML2jbafsP207adsf7DNPlmOSZG1DP1xsX2N7a/b/kZ
rHR9ps8+P2v5865icaH1K1dApuJb7bTcuOia/M4hZi7A9YnvF9rE295V/TCJiIF+S3inpVklP7nL/r0p6TFv/MuHtkk4MatYrXMe7JB0b9JwF1nGDpFtbt6+X9K+S3pb0mBRZy9Afl9av8xtbt8e09ZGBt+/Y5/ckfaJ1+7Ckzw967itYy/2S/mbQsxZczx9K+my7/4f6cUwG9ow7Ir4i6b8us8s9kj4VW74madz2Df2ZrrgC60ghin0odJZj0pcPuC5b69f5tda3Y62vnS9K3SPpgdbthyXd1frwk6FScC0p2D4g6T2Sju6yS+nHZJjPcU9I+s5F37+ohL/5Wn6u9VfEx2z/9KCH6eQyHwqd7phc7gOuleC4tP5KflrSOUnHI2LXYxIR5yW9IunN/Z2ymAJrkaRfa52Ge9j2jX0esaiPSfqwpB/ucn/px2SYw71XnNLWW1nfIemvJS0NeJ7L6vSh0Jl0WEuK4xIRmxFxi6QDkm6zffOgZ/r/KrCWf5Y0GRE/I+m4/u9Z69Cw/V5J5yLi5CDnGOZwr0m6+E/cA61tqUTEq9t/RYytj3sbs71vwGO1VeBDodMck05ryXRcJCki1iU9IendO+66cExsj0p6k6Tv9ne67uy2loj4bkT8oPXtUUk/2+/ZCrhD0kHbz0v6nKQ7bX96xz6lH5NhDvejkn6zdSXD7ZJeiYiXBj1Ut2z/5Pb5Ldu3aevXfOh+YxX8UOgUx6TIWjIcF9tV2+Ot2xVJd0t6dsduj0r6rdbteyU9Hq1XxYZJkbXseL3koLZemxgqETEXEQciYlJbLzw+HhH37dit9GNS5MOCS2H7QW29sr/P9ouS/kxbL1goIj4h6YvauorhOUn/Len9g5n08gqs415Jv2v7vKSmpMPD+BtLu3wotKSbpFzHRMXWkuG43CDpAdsj2vqD5aGIOGb7zyXVI+JRbf0B9Y+2n9PWi+SHBzfuZRVZywdsH5R0XltruX9g03ap38eEd04CQDLDfKoEANAG4QaAZAg3ACRDuAEgGcINAMkQbgBIhnADQDKEGwCS+V9wIO885t6JhwAAAABJRU5ErkJggg==\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "\n", "x = [1, 2, 3, 4]\n", "y = [2, 4, 6, 8]\n", "\n", "plt.scatter(x, y)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[アンスコムの4つ組](http://blog.livedoor.jp/oyajieng_memo/archives/1677707.html)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: numpy in /Users/k2works/.pyenv/versions/3.7.0/lib/python3.7/site-packages (1.15.1)\r\n", "Requirement already satisfied: pandas in /Users/k2works/.pyenv/versions/3.7.0/lib/python3.7/site-packages (0.23.4)\r\n", "Requirement already satisfied: pytz>=2011k in /Users/k2works/.local/lib/python3.7/site-packages (from pandas) (2018.5)\r\n", "Requirement already satisfied: python-dateutil>=2.5.0 in /Users/k2works/.local/lib/python3.7/site-packages (from pandas) (2.7.3)\r\n", "Requirement already satisfied: six>=1.5 in /Users/k2works/.local/lib/python3.7/site-packages (from python-dateutil>=2.5.0->pandas) (1.11.0)\r\n" ] } ], "source": [ "import sys\n", "!{sys.executable} -m pip install numpy pandas" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import numpy as np\n", "import pandas as pd\n", "from pandas import Series,DataFrame\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "anscombe=sns.load_dataset(\"anscombe\", engine=\"python\")\n", "sns.lmplot(x=\"x\",y=\"y\",data=anscombe,fit_reg=False)\n", "plt.show()\n", "\n", "sns.lmplot(x=\"x\",y=\"y\",data=anscombe,fit_reg=False,hue=\"dataset\",col=\"dataset\",col_wrap=2)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## ファイルからデータを読み込む\n", "### テキストファイルからデータを読み込む" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "..........\n", "----------------------------------------------------------------------\n", "Ran 10 tests in 0.005s\n", "\n", "OK\n" ] } ], "source": [ "import unittest\n", "\n", "\n", "def calculate_mean(numbers):\n", " s = sum(numbers)\n", " N = len(numbers)\n", " mean = s / N\n", "\n", " return mean\n", "\n", "'''\n", "ファイルに格納した平均を計算\n", "'''\n", "def read_data(filename):\n", " numbers = []\n", " with open(filename) as f:\n", " for line in f:\n", " numbers.append(float(line))\n", " return numbers\n", "\n", "\n", "class TestReadData(unittest.TestCase):\n", " def test_01(self):\n", " data = read_data('mydata.txt')\n", " mean = calculate_mean(data)\n", " self.assertEqual(mean, 477.75)\n", "\n", "\n", "if __name__ == '__main__':\n", " unittest.main(argv=['first-arg-is-ignored'], exit=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### CSVファイルからデータを読み込む" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import seaborn as sns\n", "\n", "'''\n", "CSVファイルからデータを読み込む\n", "'''\n", "import csv\n", "import matplotlib.pyplot as plt\n", "\n", "def scatter_plot(x, y):\n", " plt.scatter(x, y)\n", " plt.xlabel('Number')\n", " plt.ylabel('Square')\n", " plt.show()\n", "\n", "def read_csv(filename):\n", " numbers = []\n", " squared = []\n", " with open(filename) as f:\n", " reader = csv.reader(f)\n", " next(reader)\n", " for row in reader:\n", " numbers.append(int(row[0]))\n", " squared.append(int(row[1]))\n", " return numbers, squared\n", "\n", "if __name__ == '__main__':\n", " numbers, squared = read_csv('numbers.csv')\n", " scatter_plot(numbers, squared)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[https://www.google.com/trends/correlate/](https://www.google.com/trends/correlate/)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Highest correlation: 0.9643403143357506\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import seaborn as sns\n", "\n", "def read_csv(filename):\n", "\n", " with open(filename) as f:\n", " reader = csv.reader(f)\n", " next(reader)\n", "\n", " summer = []\n", " highest_correlated = []\n", " for row in reader:\n", " summer.append(float(row[1]))\n", " highest_correlated.append(float(row[2]))\n", "\n", " return summer, highest_correlated\n", "\n", "def scatter_plot(x,y):\n", " plt.scatter(x,y)\n", " plt.xlabel('Number')\n", " plt.ylabel('Square')\n", " plt.show()\n", " \n", "def find_corr_x_y(x,y):\n", " n = len(x)\n", " # 積の和を求める\n", " prod = []\n", " for xi,yi in zip(x,y):\n", " prod.append(xi*yi)\n", " sum_prod_x_y = sum(prod)\n", " sum_x = sum(x)\n", " sum_y = sum(y)\n", " squared_sum_x = sum_x**2\n", " squared_sum_y = sum_y**2\n", " x_square = []\n", " for xi in x:\n", " x_square.append(xi**2)\n", " # 和を求める\n", " x_square_sum = sum(x_square)\n", " y_square=[]\n", " for yi in y:\n", " y_square.append(yi**2)\n", " # 和を求める\n", " y_square_sum = sum(y_square)\n", "\n", " # 式を使って相関を計算\n", " numerator = n*sum_prod_x_y - sum_x*sum_y\n", " denominatior_term1 = n*x_square_sum - squared_sum_x\n", " denominatior_term2 = n*y_square_sum - squared_sum_y\n", " denominator = (denominatior_term1*denominatior_term2)**0.5\n", " correlation = numerator/denominator\n", "\n", " return correlation \n", "\n", "\n", "if __name__ == '__main__':\n", " summer, highest_correlated = read_csv('correlate-summer.csv')\n", " corr = find_corr_x_y(summer, highest_correlated)\n", " print('Highest correlation: {0}'.format(corr))\n", " scatter_plot(summer, highest_correlated)\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", 
"version": "3.7.0" } }, "nbformat": 4, "nbformat_minor": 1 }