Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
W
wanjia
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
kino
wanjia
Commits
c6b61ee2
Commit
c6b61ee2
authored
Dec 14, 2020
by
zhangminghui
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
47682d05
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
246 additions
and
0 deletions
+246
-0
api_assemble.job
wanjia_tuomin/micko/api_assemble.job
+3
-0
api_assemble.py
wanjia_tuomin/micko/api_assemble.py
+2
-0
api_assemble.sh
wanjia_tuomin/micko/api_assemble.sh
+2
-0
end.job
wanjia_tuomin/micko/end.job
+3
-0
file_handle.job
wanjia_tuomin/micko/file_handle.job
+3
-0
file_handle.py
wanjia_tuomin/micko/file_handle.py
+66
-0
file_handle.sh
wanjia_tuomin/micko/file_handle.sh
+22
-0
file_pretreatment.job
wanjia_tuomin/micko/file_pretreatment.job
+4
-0
file_pretreatment.py
wanjia_tuomin/micko/file_pretreatment.py
+99
-0
round_robin_mysql.sh
wanjia_tuomin/micko/round_robin_mysql.sh
+40
-0
start.job
wanjia_tuomin/micko/start.job
+2
-0
No files found.
wanjia_tuomin/micko/api_assemble.job
0 → 100644
View file @
c6b61ee2
type=command
dependencies=start
command=bash api_assemble.sh
\ No newline at end of file
wanjia_tuomin/micko/api_assemble.py
0 → 100644
View file @
c6b61ee2
# Script entry point: placeholder step that only prints the script's own file name.
if __name__ == '__main__':
    print("api_assemble.py")
\ No newline at end of file
wanjia_tuomin/micko/api_assemble.sh
0 → 100644
View file @
c6b61ee2
# Print this script's name, then run the Python step of the same name.
echo "api_assemble.sh"
python api_assemble.py
\ No newline at end of file
wanjia_tuomin/micko/end.job
0 → 100644
View file @
c6b61ee2
type=command
dependencies=file_pretreatment,file_handle,api_assemble
command=echo "......Scheduling end"
\ No newline at end of file
wanjia_tuomin/micko/file_handle.job
0 → 100644
View file @
c6b61ee2
type=command
dependencies=start
command=bash file_handle.sh
\ No newline at end of file
wanjia_tuomin/micko/file_handle.py
0 → 100644
View file @
c6b61ee2
import
sys
import
pymysql
import
json
def conn(sql):
    """Execute one SQL statement on the ``demo_test`` MySQL database.

    Args:
        sql: The complete SQL text to execute. It is passed to the cursor
            as-is, so the caller is responsible for building it safely.

    Returns:
        Whatever ``cursor.fetchall()`` yields for the statement.

    Raises:
        Any exception raised by PyMySQL while connecting or executing; the
        connection is closed before the exception propagates.
    """
    # NOTE(review): host and credentials are hard-coded; consider moving them
    # to configuration.
    # Keyword arguments are required: PyMySQL 1.0+ no longer accepts
    # positional arguments to connect().
    db = pymysql.connect(
        host="192.168.1.90",
        user="root",
        password="root",
        database="demo_test",
    )
    try:
        with db.cursor() as cursor:
            cursor.execute(sql)
            data = cursor.fetchall()
        db.commit()
        return data
    finally:
        # Always release the connection, even when execute() fails.
        db.close()
def getColumns(column):
    """Build the source, plain and create-table SQL strings for a column list.

    Args:
        column: Sequence of rows with at least five items each. Item 2
            (modified name) overrides item 0 (name) when it is not None,
            item 3 (modified type) overrides item 1 (type) when it is not
            None, and item 4 is the clean rule.

    Returns:
        ``[src_sql, md5_sql, create_sql]``:

        * ``src_sql`` selects every column from table ``a``; a column whose
          rule is ``"md5"`` or ``"base64"`` is emitted as
          ``rule(name) as name, name as t_name``.
        * ``md5_sql`` selects the plain column names from ``impala``.
        * ``create_sql`` declares every column as ``string`` in external
          table ``a`` located at ``/test/dt``.

    The input and the intermediate/final values are printed to stdout.
    """
    print(column)

    # Normalise each row to [effective_name, effective_type, clean_rule].
    list_cols = []
    for row in column:
        name = row[2] if row[2] is not None else row[0]
        col_type = row[3] if row[3] is not None else row[1]
        list_cols.append([name, col_type, row[4]])
    print(list_cols)

    select_parts = []
    plain_parts = []
    create_parts = []
    for name, _col_type, rule in list_cols:
        if rule == "md5" or rule == "base64":
            # Masked value keeps the column name; the raw value is kept
            # alongside it under a "t_" prefix.
            select_parts.append(
                rule + '(' + name + ') as ' + name + ", " + name + ' as t_' + name
            )
        else:
            select_parts.append(name)
        plain_parts.append(name)
        # NOTE(review): every column (masked or not) is declared in the
        # external table -- confirm this matches the intended schema.
        create_parts.append(name + " string")

    src_sql = 'select %s from %s' % (', '.join(select_parts), 'a')
    md5_sql = 'select %s from %s' % (', '.join(plain_parts), 'impala')
    location = '/test/dt'
    create_sql = 'create external table a (%s) stored as txt location "%s" ' % (
        ', '.join(create_parts),
        location,
    )

    print("---------------------->", src_sql)
    print("---------------------->", md5_sql)
    print("---------------------->", create_sql)
    return [src_sql, md5_sql, create_sql]
# Script entry point: load the column metadata for one deposited file and
# print the SQL statements that getColumns() builds from it.
if __name__ == '__main__':
    # Command-line arguments are currently disabled; fixed test values are used instead.
    # file_id = sys.argv[1]
    # hdfs_path = sys.argv[2]
    file_id = 1
    hdfs_path = "/user/datawarehouse/2020-12-10/user_info"
    # Fetch (name, type, modified name, modified type, clean rule) for every field of the file.
    get_col_info_sql = '''select field_name,field_type,modify_field_name,modify_field_type,clean_rule
from t_file_save_field_info where file_deposit = ''' + str(file_id)
    cols = conn(get_col_info_sql)
    getColumns(cols)
    # TODO: run create_sql to create the external table
    # TODO: run src_sql to load the data into Impala
    # TODO: run md5_sql to write the data out to FTP
    # TODO: update the status
wanjia_tuomin/micko/file_handle.sh
0 → 100644
View file @
c6b61ee2
#!/bin/bash
# Poll MySQL for a deposited file whose deposit_status is '01' and, when one
# is found, run file_handle.py for it.
set -e

# MySQL connection settings.
hostname="192.168.1.90"
port="3306"
username="root"
password="root"
dbname="demo_test"

# Look up the id of a file waiting to be handled.
select_db_sql="select file_deposit from t_file_handle_status where deposit_status='01'"
file_deposit=$(mysql -h${hostname} -P${port} -u${username} -p${password} ${dbname} -s -e "${select_db_sql}")
echo "file_deposit执行结果返回:$file_deposit"
# A non-empty query result means there is a file to process.
if [ ${#file_deposit} -gt 0 ]; then
    echo "轮循到新增文件,文件id为:$file_deposit"
    get_file_deposit_info_sql="select file_name from t_file_deposit where file_deposit_id=$file_deposit"
    get_file_deposit_info=$(mysql -h${hostname} -P${port} -u${username} -p${password} ${dbname} -s -e "${get_file_deposit_info_sql}")
    # First whitespace-separated field of the result is the archive file name.
    zip_file_name=`echo $get_file_deposit_info | awk -F ' ' '{print $1}'`
    # Drop everything from the first dot onwards to get the base name.
    file_name=`echo $zip_file_name | cut -d \. -f 1`
    # Target HDFS directory: /user/datawarehouse/<today>/<base name>.
    hdfs_path="/user/datawarehouse/`date +%Y-%m-%d`/$file_name"
    python3 file_handle.py $file_deposit $hdfs_path
else
    echo "未轮循到新增文件..."
fi
\ No newline at end of file
wanjia_tuomin/micko/file_pretreatment.job
0 → 100644
View file @
c6b61ee2
type=command
dependencies=start
command=echo "file_pretreatment"
# command=bash round_robin_mysql.sh
\ No newline at end of file
wanjia_tuomin/micko/file_pretreatment.py
0 → 100644
View file @
c6b61ee2
import
datetime
import
getpass
import
glob
import
os
import
shutil
import
zipfile
import
sys
import
pandas
as
pd
import
numpy
as
np
import
re
import
pymysql
def unzip_all(source_dir, dest_dir):
    """Extract every ``.zip`` file found directly inside ``source_dir``.

    Args:
        source_dir: Directory scanned (non-recursively) for ``.zip`` files.
        dest_dir: Directory the archives are extracted into. It is created,
            including missing parent directories, when the first archive is
            found.

    Returns:
        None. This is a best-effort step: any error is printed as
        ``unzip_all error:<message>`` and not re-raised.
    """
    try:
        # Context managers close the directory iterator and each archive.
        with os.scandir(source_dir) as entries:
            for entry in entries:
                is_zip = os.path.splitext(entry.name)[1] == '.zip'
                if not (entry.is_file() and is_zip):
                    continue
                # makedirs also handles nested destinations, which mkdir cannot.
                os.makedirs(dest_dir, exist_ok=True)
                with zipfile.ZipFile(entry.path) as archive:
                    archive.extractall(path=dest_dir)
    except Exception as e:
        print("unzip_all error:{0}".format(e))
def copy_file(srcPath, dtPath):
    """Copy every ``.csv`` file under ``srcPath`` (recursively) to ``dtPath``.

    Args:
        srcPath: Root directory that is walked top-down.
        dtPath: Destination handed to ``shutil.copy`` for each file.

    Returns:
        1 when at least one ``.csv`` file was copied, otherwise 0. A failure
        is printed as ``copy_file error:<message>`` and reported as 0, so the
        caller always receives an integer status.
    """
    success_code = 0
    try:
        for root, _dirs, files in os.walk(srcPath, True):
            for fileName in files:
                if fileName.endswith('.csv'):
                    shutil.copy(os.path.join(root, fileName), dtPath)
                    success_code = 1
        return success_code
    except Exception as e:
        print("copy_file error:{0}".format(e))
        # Explicit failure status instead of an implicit None.
        return 0
def analysis_csv(file_id, csv_path):
    """Profile the CSV files in ``csv_path`` and build two SQL statements.

    All files directly inside ``csv_path`` are read with pandas and
    concatenated. The first two rows are serialised to JSON for the
    ``explore_json`` update, string values are stripped, columns whose first
    value looks like a date or datetime are converted with
    ``pd.to_datetime``, and one ``t_file_save_field_info`` row is generated
    per column with its resulting dtype.

    Args:
        file_id: Identifier of the deposited file; converted with ``str()``
            wherever it is embedded in SQL.
        csv_path: Directory whose entries are read as CSV files.

    Returns:
        Tuple ``(list_error_file, update_sql, insert_sql)`` where
        ``list_error_file`` lists the paths that could not be read.

    Raises:
        ValueError: From ``pd.concat`` when no file could be read.
    """
    # Sorted so the row order (and therefore the JSON sample) is deterministic.
    csv_files = sorted(glob.glob(csv_path + "/*"))
    frames = []
    list_error_file = []
    for path in csv_files:
        try:
            frames.append(pd.read_csv(path, index_col=None, header=0))
        except Exception:
            # Best effort: remember the unreadable file and keep going.
            list_error_file.append(path)

    data = pd.concat(frames, ignore_index=True)
    print(data)

    # The JSON sample is taken before any stripping or type conversion.
    ten_json = data.head(2).to_json(orient='index')
    # NOTE(review): the JSON is embedded in a quoted SQL literal without
    # escaping; a value containing a single quote will break the statement.
    # Prefer a parameterized query at the call site.
    update_sql = (
        "update t_file_pretreatment_status set explore_json='"
        + ten_json
        + "' where file_deposit="
        + str(file_id)
    )

    # Strip leading/trailing whitespace from every string value.
    def strip_text(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.map replaces the deprecated applymap on newer pandas; look it
    # up on the class so a CSV column named "map" cannot shadow it.
    apply_elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    data = apply_elementwise(data, strip_text)

    # Matches "YYYY-MM-DD hh:mm:ss" or "YYYY-MM-DD" (separators -, / or .).
    date_pattern = re.compile(
        r'\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2}|\d{4}[-/.]\d{1,2}[-/.]\d{1,2}'
    )
    if not data.empty:
        for name in data.columns:
            sample = str(data[name].iloc[0])
            # Only exact-length datetime (19) or date (10) samples are converted.
            if date_pattern.search(sample) and len(sample) in (19, 10):
                data[name] = pd.to_datetime(data[name])

    created_at = str(datetime.datetime.now())
    created_by = str(getpass.getuser())
    value_rows = [
        "(" + str(file_id) + ", '" + str(col_name) + "', '" + str(dtype)
        + "', '" + created_at + "', '" + created_by + "')"
        for col_name, dtype in zip(data.columns.values, data.dtypes)
    ]
    insert_sql = (
        "insert into t_file_save_field_info (file_deposit,field_name,field_type,cre_time,cre_person) values "
        + ",".join(value_rows)
    )
    return list_error_file, update_sql, insert_sql
# Script entry point: unzip the deposited archive, collect its CSV files,
# profile them and store the results in MySQL.
if __name__ == '__main__':
    # Positional arguments as used below: [1] file id, [3] source directory
    # holding the archive, [5] HDFS output path, [6] unzip directory,
    # [7] directory the CSV files are copied to.
    get_file_deposit_info = sys.argv
    print(get_file_deposit_info)
    get_file_id = get_file_deposit_info[1]
    get_ftp_path = get_file_deposit_info[3]
    get_output_hdfs_path = get_file_deposit_info[5]
    get_unzip_output_path = get_file_deposit_info[6]
    get_copy_output_path = get_file_deposit_info[7]

    unzip_all(get_ftp_path, get_unzip_output_path)
    su_code = copy_file(get_unzip_output_path, get_copy_output_path)
    if su_code == 1:
        # Recreate the /user/tuomin_test directory on HDFS.
        os.system("hdfs dfs -rm -r /user/tuomin_test")
        os.system("hdfs dfs -mkdir /user/tuomin_test")

        pre_list = analysis_csv(get_file_id, get_copy_output_path)
        print("csv无法读取的文件list:" + str(pre_list[0]))
        print("十条json数据更新sql:" + str(pre_list[1]))
        print("数据字段写入sql:" + str(pre_list[2]))

        # Keyword arguments are required: PyMySQL 1.0+ no longer accepts
        # positional arguments to connect().
        db = pymysql.connect(
            host="192.168.1.90",
            user="root",
            password="root",
            database="demo_test",
        )
        try:
            with db.cursor() as cursor:
                cursor.execute(pre_list[1])
                cursor.execute(pre_list[2])
            db.commit()
        finally:
            # Release the connection even when a statement fails.
            db.close()
wanjia_tuomin/micko/round_robin_mysql.sh
0 → 100644
View file @
c6b61ee2
#!/bin/bash
# Poll MySQL for a deposited file whose pretreatment_status is '01'. When one
# is found, run file_pretreatment.py for it and upload the collected CSV
# directory to HDFS.
set -e

# MySQL connection settings.
hostname="192.168.1.90"
port="3306"
username="root"
password="root"
dbname="demo_test"

# Look up the id of a file waiting for pretreatment.
select_db_sql="select file_deposit from t_file_pretreatment_status where pretreatment_status='01'"
file_deposit=$(mysql -h${hostname} -P${port} -u${username} -p${password} ${dbname} -s -e "${select_db_sql}")
echo "file_deposit执行结果返回:$file_deposit"
# A non-empty query result means there is a file to process.
if [ ${#file_deposit} -gt 0 ]; then
    echo "轮循到新增文件,文件id为:$file_deposit"
    get_file_deposit_info_sql="select file_deposit_id,file_name,file_address,file_format from t_file_deposit where file_deposit_id=$file_deposit"
    get_file_deposit_info=$(mysql -h${hostname} -P${port} -u${username} -p${password} ${dbname} -s -e "${get_file_deposit_info_sql}")
    # Second whitespace-separated field of the result is the archive file name.
    zip_file_name=`echo $get_file_deposit_info | awk -F ' ' '{print $2}'`
    # Drop everything from the first dot onwards to get the base name.
    file_name=`echo $zip_file_name | cut -d \. -f 1`
    # Target HDFS directory: /user/datawarehouse/<today>/<base name>.
    hdfs_path="/user/datawarehouse/`date +%Y-%m-%d`/$file_name"
    unzip_output_path="/root/azkaban/dst"
    copy_output_path="/root/azkaban/dt"
    # $get_file_deposit_info is intentionally unquoted: each field becomes its
    # own positional argument for the Python script.
    python3 file_pretreatment.py $get_file_deposit_info $hdfs_path $unzip_output_path $copy_output_path
    # Test the directories inside the "if" itself: with "set -e" a bare
    # failing "hdfs dfs -test" would abort the script before "$?" is read.
    if hdfs dfs -test -d /user/tuomin_test; then
        echo "=====1===="
        if hdfs dfs -test -d $hdfs_path; then
            # Target already exists: remove it before uploading again.
            echo "=====3===="
            hdfs dfs -rm -r $hdfs_path
        else
            echo "=====2===="
        fi
        hdfs dfs -mkdir -p $hdfs_path
        hdfs dfs -put $copy_output_path $hdfs_path
        echo "=========4====="
    fi
else
    echo "未轮循到新增文件..."
fi
\ No newline at end of file
wanjia_tuomin/micko/start.job
0 → 100644
View file @
c6b61ee2
type=command
command=echo "Scheduling began....."
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment