from pyspark.sql import SparkSession
from pyspark.sql.types import * #data types
from pyspark.sql import functions as F #functions
spark=SparkSession.builder.appName('XIU-Daily').getOrCreate()
input_fn = 's-p-tsx-60-futures_01.csv'
df = spark.read.csv(input_fn,header=True,inferSchema=True)
df.show(3)
+-------------------+-----+
| date|value|
+-------------------+-----+
|1999-09-07 00:00:00|416.5|
|1999-09-08 00:00:00|417.2|
|1999-09-09 00:00:00|421.5|
+-------------------+-----+
df=df.withColumn('Date',F.date_format('date','yyyy-MM-dd')) #change date format
df=df.withColumn('current_date',F.current_date()) #current date
df=df.withColumn('year',F.year('date'))
df=df.withColumn('month',F.month('date'))
df=df.withColumn('dayofmonth',F.dayofmonth('date'))
df=df.withColumn('minute',F.minute('date'))
df=df.withColumn('second',F.second('date'))
df=df.withColumn('dayofyear',F.dayofyear('date'))
df=df.withColumn('dayofweek',F.dayofweek('date'))
df=df.withColumn('weekofyear',F.weekofyear('date'))
df=df.withColumn('quarter',F.quarter('date'))
df=df.withColumn('next_day_Mon',F.next_day('date','Mon'))
df=df.withColumn('next_day_Tue',F.next_day('date','Tue'))
df=df.withColumn('next_day_Wed',F.next_day('date','Wed'))
df=df.withColumn('next_day_Thu',F.next_day('date','Thu'))
df=df.withColumn('next_day_Fri',F.next_day('date','Fri'))
df=df.withColumn('next_day_Sat',F.next_day('date','Sat'))
df=df.withColumn('next_day_Sun',F.next_day('date','Sun'))
df=df.withColumn('trunc_year',F.trunc('date','year'))
df=df.withColumn('trunc_month',F.trunc('date','month'))
df=df.withColumn('last_day',F.last_day('date'))
df=df.withColumn('date_add_plus',F.date_add('Date',7)) # after 7 days
df=df.withColumn('date_add_minus',F.date_add('Date',-10)) # 10 days before
df=df.withColumn('date_sub',F.date_sub('date',10))
df=df.withColumn('months_between',F.months_between('date','current_date'))
df=df.withColumn('add_months_plus',F.add_months('Date',5)) # after 5 months
df=df.withColumn('add_months_minus',F.add_months('Date',-5)) # 5 months before
df=df.withColumn('Add5Years',F.add_months('Date',5*12)) # after 5 years
df=df.withColumn('sub5Years',F.add_months('Date',-5*12)) # 5 years ago
df.select('Date','year','month','dayofmonth','minute','second','last_day','dayofweek','next_day_Mon').show(5)
+----------+----+-----+----------+------+------+----------+---------+------------+
| Date|year|month|dayofmonth|minute|second| last_day|dayofweek|next_day_Mon|
+----------+----+-----+----------+------+------+----------+---------+------------+
|1999-09-07|1999| 9| 7| 0| 0|1999-09-30| 3| 1999-09-13|
|1999-09-08|1999| 9| 8| 0| 0|1999-09-30| 4| 1999-09-13|
|1999-09-09|1999| 9| 9| 0| 0|1999-09-30| 5| 1999-09-13|
|1999-09-10|1999| 9| 10| 0| 0|1999-09-30| 6| 1999-09-13|
|1999-09-13|1999| 9| 13| 0| 0|1999-09-30| 2| 1999-09-20|
+----------+----+-----+----------+------+------+----------+---------+------------+
Like this:
Like Loading...