df2['date'] = df1['date'].values
df2['hour'] = df1['hour'].values

It's better to use an inner join in this case.

case.tail(3)

                    caseCA  caseON  caseDailyCA  caseDailyON
case_Date_province
2020-04-26           47864   15411       1598.0        498.0
2020-04-27           49499   15868       1635.0        457.0
2020-04-28           50982   16337       1483.0        469.0

test.tail(3)

              testCA  testON  testDailyCA  testDailyON
date_testing
2020-04-26    734824  229638      23570.0      12020.0
2020-04-27    765056  242188      30232.0      12550.0
2020-04-28    787612  253040      22556.0      10852.0

case_test = ..
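A minimal sketch of the truncated case_test step, assuming case and test are the two date-indexed frames shown above; an inner join keeps only dates present in both, so the columns stay aligned even if one series starts earlier or ends later:

import pandas as pd

# Inner join on the date index; only dates common to both frames survive.
case_test = case.merge(test, left_index=True, right_index=True, how='inner')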
import pandas as pd
from plotly.offline import iplot
import cufflinks
cufflinks.go_offline()
# Set global theme
cufflinks.set_config_file(world_readable=True, theme='pearl')

fig = df.iplot(asFigure=True, mode='lines+markers', size=6,
               secondary_y='Increase', secondary_y_title='Increase',
               xTitle='Date', yTitle='Cases',
               title='Projected COVID-19 Cases in South Korea',
               theme='solar')
fig.show()

import pandas as pd
import chart_studio.plotly as py
from ipywidgets import interact, interact_manual
import cufflinks as cf

@interact
def plot_ProjectedSouthKereaCOVID19():
    fig = output.iplot(asFigure=True, ..
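The excerpt assumes a df with a 'Cases' series and an 'Increase' column already exists. A self-contained sketch with made-up numbers (the frame below is illustrative, not from the post) shows the same secondary-axis pattern end to end:

import pandas as pd
import cufflinks
cufflinks.go_offline()

# Hypothetical stand-in for the post's df: cumulative cases plus daily increase.
dates = pd.date_range('2020-03-01', periods=5)
df = pd.DataFrame({'Cases': [100, 160, 250, 370, 520]}, index=dates)
df['Increase'] = df['Cases'].diff()

fig = df.iplot(asFigure=True, mode='lines+markers', size=6,
               secondary_y='Increase', secondary_y_title='Increase',
               xTitle='Date', yTitle='Cases', title='Cases vs. daily increase')
fig.show()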
df.orderBy('colname1', 'colname2', ascending=False)

from pyspark.sql.functions import sort_array
df = spark.createDataFrame([([2, 1, 3],), ([1],), ([],)], ['data'])
df.show()
+---------+
|     data|
+---------+
|[2, 1, 3]|
|      [1]|
|       []|
+---------+

df0 = spark.createDataFrame(df.select(sort_array(df.data).alias('r')).collect(), ['data'])
df0.show()
+---------+
|     data|
+---------+
|[1, 2, 3]|
|      [1]|
|       []|
+---------+

df1 = spark.createDataFrame(df.select(sort_array(df.data, asc=False).alias('r')).collect(), ['data'])
df1.show()
+---------+
|     data|
+---------+
|[3, 2, 1]|
|      [1]|
|       []|
+..
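A side note on the design: the collect()/createDataFrame round-trip above pulls all rows to the driver just to rename the result column. The same output stays distributed with a plain select (a sketch using the same df as above):

from pyspark.sql.functions import sort_array

# Same results as df0/df1 above, without collecting to the driver.
df0 = df.select(sort_array(df.data).alias('data'))
df1 = df.select(sort_array(df.data, asc=False).alias('data'))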
import pandas as pd
import numpy as np
# model
from lmfit import Minimizer, Parameters, report_fit
# plot
import chart_studio.plotly as py
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import cufflinks as cf

theCountry = 'Canada'
threshold = 10
theData = confirmed_series_21[confirmed_series_21[theCountry] > threshold]
data = theData[theCountry]
start_date = data.index[0]
end_date = data.index[-1]
dateData = pd.date_range(start=start_date, end=end_date)
forecastDays = 60
dateForecast = pd.date_range(start=end_date, periods=forecastDays + 1)[1:]
dateObsForecast = dateData.append(dateForecast)
# dateObsForecast

# define objective function: returns the array ..
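The objective function itself is cut off in the excerpt. A minimal sketch of what such a residual function typically looks like with lmfit, assuming a logistic growth model (the parameter names L, k, t0 are illustrative, not from the post):

import numpy as np
from lmfit import Minimizer, Parameters, report_fit

def residual(params, t, observed):
    # Logistic curve L / (1 + exp(-k * (t - t0))); the residual array drives the fit.
    L = params['L']
    k = params['k']
    t0 = params['t0']
    model = L / (1 + np.exp(-k * (t - t0)))
    return model - observed

params = Parameters()
params.add('L', value=data.max() * 2, min=0)   # carrying-capacity guess
params.add('k', value=0.1, min=0)              # growth-rate guess
params.add('t0', value=len(data) / 2)          # inflection-point guess

t = np.arange(len(data), dtype=float)
result = Minimizer(residual, params, fcn_args=(t, data.values)).minimize()
report_fit(result)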
import pandas as pd
import folium

stations_fn = "LatitudeLongitude.csv"
df = pd.read_csv(stations_fn)

world_map = folium.Map(location=[10, 0], tiles="cartodbpositron",
                       zoom_start=2, max_zoom=25, min_zoom=2)
for i in range(0, len(df) - 100000):
    folium.Circle(
        location=[df.iloc[i]['Latitude'], df.iloc[i]['Longitude']],
        radius=0.5,
        color='#0066ff',
        fill_color='#3385ff',
        fill=True).add_to(world_map)..
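The excerpt ends before the map is rendered. In a Jupyter notebook the map object displays itself as the cell result; outside one it can be written to HTML (the file name below is illustrative):

world_map.save('stations_map.html')  # open the HTML file in a browser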
Rename columns x1 to x3, x2 to x4:

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('rename columns').getOrCreate()

data = spark.createDataFrame([(1, 2), (3, 4)], ['x1', 'x2'])
data.show()
data = data.withColumnRenamed('x1', 'x3') \
           .withColumnRenamed('x2', 'x4')
d..
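For many columns, chaining withColumnRenamed gets verbose; toDF replaces every column name in one call (a sketch using the same data frame):

# Same rename in one shot: new names are positional, one per column.
data = data.toDF('x3', 'x4')
data.show()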
import cv2

input_imgfn = "tobrighten.jpg"
output_imgfn = "brightened.jpg"

def change_brightness(img, value=30):
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    h, s, v = cv2.split(hsv)
    v = cv2.add(v, value)      # cv2.add saturates uint8 to [0, 255]
    v[v > 255] = 255
    v[v < 0] = 0
    final_hsv = cv2.merge((h, s, v))
    img = cv2.cvtColor(final_hsv, cv2.COLOR_HSV2BGR)
    return img

img = cv2.imread(input_imgfn)              # load BGR image
img = change_brightness(img, value=90)     # increase brightness
# img = change_brightness(img, value=-30)  # decrease brightness
..
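The excerpt defines output_imgfn but is cut off before it is used; presumably the brightened image is written back to disk, which with OpenCV would look like:

cv2.imwrite(output_imgfn, img)  # assumption: the truncated post saves the result here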
output = pd.DataFrame({'date': [], 'Forecast': [], 'Cases': [], 'Fitting': [], 'Increase': []})
output['date'] = dateObsForecast
output['Forecast'] = y1 * last
output['Cases'].iloc[:dataLen] = data.values * last
output['Fitting'].iloc[:dataLen] = final.values * last
output['Increase'].iloc[1:] = (y1[1:] - y1[:-1]) * last
output = output.set_ind..
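One caveat worth flagging: the ['Cases'].iloc[...] = pattern is chained indexing, which pandas warns about and may silently fail to apply in newer versions. Writing through .loc avoids it (a sketch reusing the same names):

# Equivalent assignments without chained indexing.
output.loc[output.index[:dataLen], 'Cases'] = data.values * last
output.loc[output.index[:dataLen], 'Fitting'] = final.values * last
output.loc[output.index[1:], 'Increase'] = (y1[1:] - y1[:-1]) * last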
Examples for pivot_table of Pandas and crosstab of Pyspark, from my work directory: pyWorkDir/Bigdata/Pyspark/DataForYuanPei.ipynb

pivot_table:

import numpy as np
import pandas as pd

casepandas = indcases.toPandas()
casetable1 = pd.pivot_table(casepandas, values='VALUE',
                            index=['Case identifier number'],
                            columns=['Case information'],
                            aggfunc=np.sum)

crosstab:

casetable = casedf.crosstab('case_Date', 'province')
casetable = casetable.toPandas()
casetable = casetable.sort_values('case_Date_province')
cumsum_casetable = casetable.set_index('case_Date_province').cumsum()
cumsum_casetable['CA'] = cumsum_casetable.sum(axis=1)

casedftable = casedf.crosstab('case_Date', 'health_region')
health_region_table = casedftable.select(['case_Date_health_region', 'Toronto', 'Montréal', 'Vancouver Coastal', ..
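A self-contained toy version of the same comparison, assuming an active SparkSession named spark and a tiny made-up frame, so the two outputs can be seen side by side:

import pandas as pd

toy = pd.DataFrame({'case_Date': ['2020-04-26', '2020-04-26', '2020-04-27'],
                    'province': ['ON', 'BC', 'ON'],
                    'VALUE': [1, 1, 1]})

# Pandas: pivot_table aggregates VALUE into a date x province grid.
print(pd.pivot_table(toy, values='VALUE', index='case_Date',
                     columns='province', aggfunc='sum', fill_value=0))

# PySpark: crosstab counts co-occurrences of the two columns.
sdf = spark.createDataFrame(toy)
sdf.crosstab('case_Date', 'province').show()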
Rename columns:

df = df.rename(columns={"oldcol1": "newcol1", "oldcol2": "newcol2"})

Change the value of a column under a condition:

df_confirmed.loc[df_confirmed['country'] == "US", "country"] = "USA"

Replace NaN with some value:

df_confirmed = df_confirmed.replace(np.nan, '', regex=True)

Drop several columns (Lat and Long):

df = df.drop(['Lat', 'Long'..
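A quick end-to-end run of these recipes on a throwaway frame (the column names and values below are illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame({'country': ['US', 'Canada'], 'Lat': [38.0, 56.1],
                   'Long': [-97.0, -106.3], 'cases': [100, np.nan]})
df = df.rename(columns={'cases': 'confirmed'})        # rename a column
df.loc[df['country'] == 'US', 'country'] = 'USA'      # conditional update
df = df.replace(np.nan, '', regex=True)               # fill NaN
df = df.drop(['Lat', 'Long'], axis=1)                 # drop columns
print(df)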