In [1]:
import pandas as pd
import numpy as np
from konlpy.tag import Okt
from wordcloud import WordCloud
import seaborn as sns
import datetime
from PIL import Image
from collections import Counter
import matplotlib.pyplot as plt
import folium
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Input In [1], in <cell line: 10>()
8 from collections import Counter
9 import matplotlib.pyplot as plt
---> 10 import folium
ModuleNotFoundError: No module named 'folium'
In [4]:
# Location of the World Cup goals CSV (one row per scorer).
WORLDCUP_GOALS_URL = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/worldcup/worldcupgoals.csv'

# Read the raw table and print its schema (column names, dtypes, null counts).
df = pd.read_csv(WORLDCUP_GOALS_URL)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1295 entries, 0 to 1294
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Player 1295 non-null object
1 Goals 1295 non-null int64
2 Years 1295 non-null object
3 Country 1295 non-null object
dtypes: int64(1), object(3)
memory usage: 40.6+ KB
In [5]:
# Preview the first five rows of the loaded table.
df.head(n=5)
Out[5]:
Player | Goals | Years | Country | |
---|---|---|---|---|
0 | Miroslav Klose | 16 | 2002-2006-2010-2014 | Germany |
1 | Ronaldo | 15 | 1998-2002-2006 | Brazil |
2 | Gerd Muller | 14 | 1970-1974 | Germany |
3 | Just Fontaine | 13 | 1958 | France |
4 | Pele | 12 | 1958-1962-1966-1970 | Brazil |
In [6]:
# Top five countries by total goals.
# Select 'Goals' explicitly before summing: since pandas 2.0 a bare
# groupby(...).sum() no longer drops non-numeric columns, so the object
# columns (Player, Years, ...) would be summed too instead of being ignored.
# The double brackets keep the result a one-column DataFrame.
(
    df.groupby('Country')[['Goals']]
    .sum()
    .sort_values('Goals', ascending=False)
    .head(5)
)
Out[6]:
Goals | |
---|---|
Country | |
Brazil | 228 |
Germany | 226 |
Argentina | 135 |
Italy | 123 |
France | 115 |
In [7]:
# Number of rows (scorers) per country, largest first.
rows_per_country = df.groupby('Country').size()
rows_per_country.sort_values(ascending=False)
Out[7]:
Country
Brazil 81
Germany 78
Italy 60
Argentina 59
France 58
..
Haiti 1
Angola 1
Bolivia 1
Israel 1
Iraq 1
Length: 76, dtype: int64
In [8]:
# Split the hyphen-separated 'Years' string into a list of year tokens,
# e.g. '1998-2002-2006' -> ['1998', '2002', '2006'].
df['year_list'] = df['Years'].str.split('-')
df.head(n=5)
Out[8]:
Player | Goals | Years | Country | year_list | |
---|---|---|---|---|---|
0 | Miroslav Klose | 16 | 2002-2006-2010-2014 | Germany | [2002, 2006, 2010, 2014] |
1 | Ronaldo | 15 | 1998-2002-2006 | Brazil | [1998, 2002, 2006] |
2 | Gerd Muller | 14 | 1970-1974 | Germany | [1970, 1974] |
3 | Just Fontaine | 13 | 1958 | France | [1958] |
4 | Pele | 12 | 1958-1962-1966-1970 | Brazil | [1958, 1962, 1966, 1970] |
In [9]:
def checkFour(x):
    """Return True when every item of ``x`` is exactly four characters long.

    Each item is converted with ``str()`` before its length is measured, so
    both ``'2002'`` and ``2002`` count as valid.

    Parameters
    ----------
    x : iterable
        Year tokens for one row, e.g. ``['1998', '2002']``.

    Returns
    -------
    bool
        ``True`` if all items have length 4 (also for an empty iterable),
        ``False`` if any item does not, or if ``x`` is not iterable at all
        (e.g. the NaN that ``Series.str.split`` produces for a missing value).
    """
    try:
        values = iter(x)
    except TypeError:
        # A non-iterable cell (such as NaN) cannot hold valid year tokens.
        return False
    return all(len(str(value)) == 4 for value in values)
# Flag each row: True when all of its year tokens are four characters long.
df['check'] = df['year_list'].map(checkFour)

# Report how many rows contain at least one malformed year token.
malformed_rows = df[df['check'] == False]
print(len(malformed_rows))
45
In [10]:
# Number of year tokens per row, i.e. how many World Cups are listed for the
# player. Malformed tokens flagged by 'check' are counted as well.
year_tokens = df['year_list']
df['LenCup'] = year_tokens.str.len()
df.head(n=10)
Out[10]:
Player | Goals | Years | Country | year_list | check | LenCup | |
---|---|---|---|---|---|---|---|
0 | Miroslav Klose | 16 | 2002-2006-2010-2014 | Germany | [2002, 2006, 2010, 2014] | True | 4 |
1 | Ronaldo | 15 | 1998-2002-2006 | Brazil | [1998, 2002, 2006] | True | 3 |
2 | Gerd Muller | 14 | 1970-1974 | Germany | [1970, 1974] | True | 2 |
3 | Just Fontaine | 13 | 1958 | France | [1958] | True | 1 |
4 | Pele | 12 | 1958-1962-1966-1970 | Brazil | [1958, 1962, 1966, 1970] | True | 4 |
5 | Jurgen Klinsmann | 11 | 1990-1994-1998 | Germany | [1990, 1994, 1998] | True | 3 |
6 | Sandor Kocsis | 11 | 1954 | Hungary | [1954] | True | 1 |
7 | Gabriel Batistuta | 10 | 1994-1998-2002 | Argentina | [1994, 1998, 2002] | True | 3 |
8 | Gary Lineker | 10 | 1986-1990 | England | [1986, 1990] | True | 2 |
9 | Thomas Muller | 10 | 2010-2014 | Germany | [2010, 2014] | True | 2 |
In [11]:
# How many players are listed for exactly four World Cups?
# Summing the boolean mask yields 0 when nobody matches, whereas
# value_counts()[4] raises KeyError if the label 4 is absent.
int((df['LenCup'] == 4).sum())
Out[11]:
16
In [12]:
# Show every player whose 'Years' string mentions 2002.
mentions_2002 = df['Years'].str.contains('2002')
df[mentions_2002]
Out[12]:
Player | Goals | Years | Country | year_list | check | LenCup | |
---|---|---|---|---|---|---|---|
0 | Miroslav Klose | 16 | 2002-2006-2010-2014 | Germany | [2002, 2006, 2010, 2014] | True | 4 |
1 | Ronaldo | 15 | 1998-2002-2006 | Brazil | [1998, 2002, 2006] | True | 3 |
7 | Gabriel Batistuta | 10 | 1994-1998-2002 | Argentina | [1994, 1998, 2002] | True | 3 |
20 | Christian Vieri | 9 | 1998-2002 | Italy | [1998, 2002] | True | 2 |
26 | Rivaldo | 8 | 1998-2002 | Brazil | [1998, 2002] | True | 2 |
... | ... | ... | ... | ... | ... | ... | ... |
1231 | Alvaro Recoba | 1 | 2002 | Uruguay | [2002] | True | 1 |
1233 | Dario Rodriguez | 1 | 2002 | Uruguay | [2002] | True | 1 |
1244 | Clint Mathis | 1 | 2002 | USA | [2002] | True | 1 |
1246 | John O Brien | 1 | 2002-2006 | USA | [2002, 2006] | True | 2 |
1247 | Earnie Stewart | 1 | 1994-1998-2002 | USA | [1994, 1998, 2002] | True | 3 |
156 rows × 7 columns
In [13]:
#이름이 carlos인 데이터 출력
In [14]:
# Show the rows whose player name contains 'carlos', ignoring case by
# lower-casing the names before matching.
player_lower = df['Player'].str.lower()
df[player_lower.str.contains('carlos')]
Out[14]:
Player | Goals | Years | Country | year_list | check | LenCup | |
---|---|---|---|---|---|---|---|
150 | Carlos Borges | 4 | 1954 | Uruguay | [1954] | True | 1 |
163 | Carlos Peucelle | 3 | 1930 | Argentina | [1930] | True | 1 |
165 | Carlos Tevez | 3 | 2006-2010 | Argentina | [2006, 2010] | True | 2 |
338 | Carlos Tenorio | 2 | 2002-2006 | Ecuador | [2002, 2006] | True | 2 |
511 | Carlos Babington | 1 | 1974 | Argentina | [1974] | True | 1 |
584 | Carlos Alberto | 1 | 1970 | Brazil | [1970] | True | 1 |
589 | Roberto Carlos | 1 | 1998-2002-2006 | Brazil | [1998, 2002, 2006] | True | 3 |
643 | Carlos Vidal | 1 | 1930 | Chile | [1930] | True | 1 |
655 | Carlos Valderrama | 1 | 1990-1994-1998 | Colombia | [1990, 1994, 1998] | True | 3 |
684 | Carlos Oliveira | 1 | 1938 | Cuba | [1938] | True | 1 |
940 | Carlos Vela | 1 | 2018 | Mexico | [2018] | True | 1 |
1041 | Carlos Manuel | 1 | 1986 | Portugal | [1986] | True | 1 |
1151 | Juan Carlos Valeron | 1 | 2002 | Spain | [2002] | True | 1 |
In [15]:
# Players listed for only one World Cup who scored the most goals (top five).
single_cup = df[df['LenCup'] == 1]
single_cup_by_goals = single_cup.sort_values('Goals', ascending=False)
single_cup_by_goals['Player'].values[:5]
Out[15]:
array(['Just Fontaine', 'Sandor Kocsis', 'Ademir', 'Eusebio',
'Guillermo Stabile'], dtype=object)
In [16]:
# Which country has the most players listed for exactly one World Cup?
# value_counts() sorts descending, so the first index label is the answer.
single_cup_countries = df.loc[df['LenCup'] == 1, 'Country']
single_cup_countries.value_counts().index[0]
Out[16]:
'Brazil'
In [ ]:
In [ ]:
In [ ]:
'파이썬 활용 > 문자열' 카테고리의 다른 글
문자열 함수 모음집 (0) | 2022.10.25 |
---|---|
지하철 사용 빈도 알아보기 2 시계열 문자열 활용 (0) | 2022.10.25 |
CSV파일 불러온 후 문자열함수를 활용하여 전처리 하기 (0) | 2022.10.25 |
text파일을 통한 자연어 처리 빈도 분석(Okt, wordcloud) (0) | 2022.10.25 |
문자열 이용해서 간단한 프로그램 만들기(주민번호) (0) | 2022.10.24 |