Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
W
wanjia
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
kino
wanjia
Commits
c6b61ee2
Commit
c6b61ee2
authored
4 years ago
by
zhangminghui
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
47682d05
Changes
11
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
246 additions
and
0 deletions
+246
-0
api_assemble.job
wanjia_tuomin/micko/api_assemble.job
+3
-0
api_assemble.py
wanjia_tuomin/micko/api_assemble.py
+2
-0
api_assemble.sh
wanjia_tuomin/micko/api_assemble.sh
+2
-0
end.job
wanjia_tuomin/micko/end.job
+3
-0
file_handle.job
wanjia_tuomin/micko/file_handle.job
+3
-0
file_handle.py
wanjia_tuomin/micko/file_handle.py
+66
-0
file_handle.sh
wanjia_tuomin/micko/file_handle.sh
+22
-0
file_pretreatment.job
wanjia_tuomin/micko/file_pretreatment.job
+4
-0
file_pretreatment.py
wanjia_tuomin/micko/file_pretreatment.py
+99
-0
round_robin_mysql.sh
wanjia_tuomin/micko/round_robin_mysql.sh
+40
-0
start.job
wanjia_tuomin/micko/start.job
+2
-0
No files found.
wanjia_tuomin/micko/api_assemble.job
0 → 100644
View file @
c6b61ee2
# Command-type job definition: runs api_assemble.sh through bash.
# Declares a dependency on the "start" job.
type=command
dependencies=start
command=bash api_assemble.sh
\ No newline at end of file
This diff is collapsed.
Click to expand it.
wanjia_tuomin/micko/api_assemble.py
0 → 100644
View file @
c6b61ee2
def main():
    """Print this script's file name to standard output."""
    print("api_assemble.py")


if __name__ == '__main__':
    main()
\ No newline at end of file
This diff is collapsed.
Click to expand it.
wanjia_tuomin/micko/api_assemble.sh
0 → 100644
View file @
c6b61ee2
#!/bin/bash
# Wrapper started by api_assemble.job: announces itself, then runs api_assemble.py.
# Stop on the first failing command so the scheduler sees a non-zero exit status.
set -e

echo "api_assemble.sh"
# Use python3 like file_handle.sh and round_robin_mysql.sh do.
python3 api_assemble.py
\ No newline at end of file
This diff is collapsed.
Click to expand it.
wanjia_tuomin/micko/end.job
0 → 100644
View file @
c6b61ee2
# Final command-type job of the flow: only echoes an end marker.
# Depends on the file_pretreatment, file_handle and api_assemble jobs.
type=command
dependencies=file_pretreatment,file_handle,api_assemble
command=echo "......Scheduling end"
\ No newline at end of file
This diff is collapsed.
Click to expand it.
wanjia_tuomin/micko/file_handle.job
0 → 100644
View file @
c6b61ee2
# Command-type job definition: runs file_handle.sh through bash.
# Declares a dependency on the "start" job.
type=command
dependencies=start
command=bash file_handle.sh
\ No newline at end of file
This diff is collapsed.
Click to expand it.
wanjia_tuomin/micko/file_handle.py
0 → 100644
View file @
c6b61ee2
import
sys
import
pymysql
import
json
def conn(sql):
    """Execute one SQL statement against the ``demo_test`` MySQL database.

    A new connection is opened for every call. The statement is executed,
    all result rows are fetched, the transaction is committed and the
    connection is closed again.

    Args:
        sql: Complete SQL statement text to execute.

    Returns:
        The value of ``cursor.fetchall()`` for the executed statement.
    """
    # Keyword arguments are required by PyMySQL 1.0+, which no longer accepts
    # positional connect() parameters.
    # NOTE(review): host and credentials are hard-coded here; consider moving
    # them to configuration.
    db = pymysql.connect(
        host="192.168.1.90",
        user="root",
        password="root",
        database="demo_test",
    )
    try:
        cursor = db.cursor()
        cursor.execute(sql)
        data = cursor.fetchall()
        db.commit()
        return data
    finally:
        # Close the connection even when execute()/fetchall() raises, so a
        # failing statement does not leak it.
        db.close()
def getColumns(column):
    """Build the SQL statements for one file from its column metadata rows.

    Args:
        column: Sequence of rows indexed as
            ``(field_name, field_type, modify_field_name, modify_field_type,
            clean_rule)`` -- the column order selected by this script's entry
            point. A ``None`` modified name/type falls back to the original
            name/type.

    Returns:
        ``[src_sql, md5_sql, create_sql]``:

        * ``src_sql`` selects every column from table ``a``; a column whose
          clean rule is ``"md5"`` or ``"base64"`` is selected twice, once
          wrapped in that function and once unchanged under a ``t_`` prefix.
        * ``md5_sql`` selects the plain column names from ``impala``.
        * ``create_sql`` creates external table ``a`` with every column typed
          as ``string``, located at ``/test/dt``.
    """
    print(column)

    # Normalise every row to [effective name, effective type, clean rule].
    list_cols = [
        [
            row[2] if row[2] is not None else row[0],
            row[3] if row[3] is not None else row[1],
            row[4],
        ]
        for row in column
    ]
    print(list_cols)

    select_parts = []
    file_parts = []
    create_parts = []
    for name, _field_type, rule in list_cols:
        if rule in ("md5", "base64"):
            # Masked value keeps the column name; the raw value is kept
            # alongside it under a "t_" prefix.
            select_parts.append(rule + '(' + name + ') as ' + name)
            select_parts.append(name + ' as t_' + name)
        else:
            select_parts.append(name)
        file_parts.append(name)
        create_parts.append(name + " string")

    src_sql = 'select %s from %s' % (", ".join(select_parts), 'a')
    md5_sql = 'select %s from %s' % (", ".join(file_parts), 'impala')
    location = '/test/dt'
    # NOTE(review): "stored as txt" is kept exactly as written; confirm the
    # target engine accepts it (Hive/Impala document the TEXTFILE keyword).
    create_sql = 'create external table a (%s) stored as txt location "%s" ' % (
        ", ".join(create_parts), location)

    print("---------------------->", src_sql)
    print("---------------------->", md5_sql)
    print("---------------------->", create_sql)
    return [src_sql, md5_sql, create_sql]
if __name__ == '__main__':
    # Defaults used when the script is started without arguments.
    file_id = 1
    hdfs_path = "/user/datawarehouse/2020-12-10/user_info"

    # file_handle.sh starts this script as:
    #     python3 file_handle.py <file_id> <hdfs_path>
    if len(sys.argv) > 1:
        # int() validates the id and keeps arbitrary text out of the SQL
        # string that is concatenated below.
        file_id = int(sys.argv[1])
    if len(sys.argv) > 2:
        # Not consumed yet: the steps that would use it are still TODO below.
        hdfs_path = sys.argv[2]

    get_col_info_sql = '''select field_name,field_type,modify_field_name,modify_field_type,clean_rule
    from t_file_save_field_info where file_deposit = ''' + str(file_id)
    cols = conn(get_col_info_sql)
    getColumns(cols)
    # TODO: run create_sql to create the external table
    # TODO: run src_sql to load the data into impala
    # TODO: run md5_sql to write the result out to ftp
    # TODO: update the processing status
This diff is collapsed.
Click to expand it.
wanjia_tuomin/micko/file_handle.sh
0 → 100644
View file @
c6b61ee2
#!/bin/bash
# Polls t_file_handle_status for a file whose deposit_status is '01' and, when
# one is found, runs file_handle.py for it.
set -e

# NOTE(review): connection settings and credentials are hard-coded.
hostname="192.168.1.90"
port="3306"
username="root"
password="root"
dbname="demo_test"

# Run one SQL statement ($1) and print the result rows.
# -N suppresses the column-name header so only data rows are captured.
run_sql() {
    mysql -h "${hostname}" -P "${port}" -u "${username}" -p"${password}" "${dbname}" -s -N -e "$1"
}

select_db_sql="select file_deposit from t_file_handle_status where deposit_status='01'"
pending_ids=$(run_sql "${select_db_sql}")
echo "file_deposit执行结果返回:$pending_ids"

# Several pending rows come back one id per line. Only the first id is handled
# per run; pasting all of them into the follow-up query would produce invalid SQL.
file_deposit=${pending_ids%%$'\n'*}

if [ -n "$file_deposit" ]; then
    echo "轮循到新增文件,文件id为:$file_deposit"
    get_file_deposit_info_sql="select file_name from t_file_deposit where file_deposit_id=$file_deposit"
    get_file_deposit_info=$(run_sql "${get_file_deposit_info_sql}")
    # First whitespace-separated field of the first row is the archive name.
    zip_file_name=$(echo "$get_file_deposit_info" | awk 'NR==1 {print $1}')
    # Everything before the first dot is used as the HDFS directory name.
    file_name=$(echo "$zip_file_name" | cut -d . -f 1)
    hdfs_path="/user/datawarehouse/$(date +%Y-%m-%d)/$file_name"
    python3 file_handle.py "$file_deposit" "$hdfs_path"
else
    echo "未轮循到新增文件..."
fi
\ No newline at end of file
This diff is collapsed.
Click to expand it.
wanjia_tuomin/micko/file_pretreatment.job
0 → 100644
View file @
c6b61ee2
# Command-type job definition that declares a dependency on the "start" job.
# NOTE(review): the active command only echoes a marker; the line that would run
# round_robin_mysql.sh is commented out below -- confirm whether that is intended.
type=command
dependencies=start
command=echo "file_pretreatment"
# command=bash round_robin_mysql.sh
\ No newline at end of file
This diff is collapsed.
Click to expand it.
wanjia_tuomin/micko/file_pretreatment.py
0 → 100644
View file @
c6b61ee2
import
datetime
import
getpass
import
glob
import
os
import
shutil
import
zipfile
import
sys
import
pandas
as
pd
import
numpy
as
np
import
re
import
pymysql
def unzip_all(source_dir, dest_dir):
    """Extract every ``.zip`` file located directly in ``source_dir``.

    Only regular files whose extension is exactly ``.zip`` are considered;
    sub-directories of ``source_dir`` are not searched. ``dest_dir`` (including
    missing parent directories) is created when the first archive is found.

    Any error is reported on standard output and stops the extraction; it is
    not propagated to the caller.

    Args:
        source_dir: Directory that contains the zip archives.
        dest_dir: Directory that receives the extracted content.
    """
    try:
        # Context managers release the directory iterator and every archive
        # handle, also when extraction fails part-way.
        with os.scandir(source_dir) as entries:
            for entry in entries:
                if not entry.is_file():
                    continue
                if os.path.splitext(entry.name)[1] != '.zip':
                    continue
                # makedirs also creates missing parents, unlike os.mkdir.
                os.makedirs(dest_dir, exist_ok=True)
                with zipfile.ZipFile(entry.path) as archive:
                    archive.extractall(path=dest_dir)
    except Exception as e:
        # Deliberately best-effort: report the problem and return.
        print("unzip_all error:{0}".format(e))
def copy_file(srcPath, dtPath):
    """Copy every ``.csv`` file found anywhere below ``srcPath`` into ``dtPath``.

    The directory tree is flattened: all matches end up directly in ``dtPath``,
    so files with the same name overwrite each other.

    Args:
        srcPath: Root directory that is walked recursively.
        dtPath: Destination directory; created when the first match is found.

    Returns:
        ``1`` when at least one file was copied, ``0`` when nothing matched or
        an error occurred (the error is printed, not raised).
    """
    success_code = 0
    try:
        for root, _dirs, files in os.walk(srcPath, True):
            for fileName in files:
                if not fileName.endswith('.csv'):
                    continue
                # Without an existing directory shutil.copy would create a
                # plain *file* called dtPath and overwrite it for every match.
                os.makedirs(dtPath, exist_ok=True)
                shutil.copy(os.path.join(root, fileName), dtPath)
                success_code = 1
    except Exception as e:
        print("copy_file error:{0}".format(e))
        # Always hand back an int so callers can compare against 1.
        return 0
    return success_code
# Matches "YYYY-MM-DD hh:mm:ss" or "YYYY-MM-DD"; '-', '/' and '.' are accepted
# as date separators.
_DATE_PATTERN = re.compile(
    r'\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2}|\d{4}[-/.]\d{1,2}[-/.]\d{1,2}'
)


def _sql_quote(text):
    """Escape ``text`` for use inside a single-quoted SQL string literal."""
    # Backslashes are doubled first so sequences such as \uXXXX or \" inside
    # the JSON survive; assumes the server treats backslash as an escape
    # character (MySQL default sql_mode) -- TODO confirm.
    return str(text).replace("\\", "\\\\").replace("'", "''")


def _strip_text(value):
    """Return ``value`` without surrounding whitespace when it is a string."""
    return value.strip() if isinstance(value, str) else value


def analysis_csv(file_id, csv_path, preview_rows=2):
    """Profile the files in ``csv_path`` and build the SQL that records the result.

    Every file directly inside ``csv_path`` is read with ``pandas.read_csv``;
    files that cannot be read are collected instead of aborting the run. The
    readable files are concatenated, a JSON preview of the first rows is taken,
    string values are stripped, and columns whose first value looks like a date
    or timestamp are converted with ``pandas.to_datetime``.

    Args:
        file_id: Id of the deposited file; may be a string or an int.
        csv_path: Directory containing the files to analyse.
        preview_rows: Number of leading rows stored in the JSON preview.

    Returns:
        A tuple ``(list_error_file, update_sql, insert_sql)``:

        * ``list_error_file`` -- paths that could not be read.
        * ``update_sql`` -- statement storing the JSON preview in
          ``t_file_pretreatment_status``.
        * ``insert_sql`` -- statement inserting one row per column (name and
          detected dtype) into ``t_file_save_field_info``.

    Raises:
        ValueError: If no file under ``csv_path`` could be read.
    """
    csv_files = glob.glob(csv_path + "/*")
    frames = []
    list_error_file = []
    for path in csv_files:
        try:
            frames.append(pd.read_csv(path, index_col=None, header=0))
        except Exception:
            # Best-effort: remember the unreadable file and carry on.
            list_error_file.append(path)

    if not frames:
        raise ValueError("no readable CSV data found under {0}".format(csv_path))

    data = pd.concat(frames, ignore_index=True)
    print(data)

    # The preview is taken before any value is cleaned or converted.
    preview_json = data.head(preview_rows).to_json(orient='index')
    update_sql = (
        "update t_file_pretreatment_status set explore_json='"
        + _sql_quote(preview_json)
        + "' where file_deposit=" + str(file_id)
    )

    # Strip leading/trailing whitespace from every string value.
    for name in data.columns:
        data[name] = data[name].map(_strip_text)

    # A column is treated as a date when its first value contains a date
    # pattern and is exactly 10 ("YYYY-MM-DD") or 19 ("YYYY-MM-DD hh:mm:ss")
    # characters long.
    if not data.empty:
        for name in data.columns:
            first_value = str(data[name].iloc[0])
            if _DATE_PATTERN.search(first_value) and len(first_value) in (10, 19):
                data[name] = pd.to_datetime(data[name])

    created_at = str(datetime.datetime.now())
    created_by = _sql_quote(getpass.getuser())
    value_rows = [
        "(" + str(file_id) + ", '" + _sql_quote(col_name) + "', '" + str(dtype)
        + "', '" + created_at + "', '" + created_by + "')"
        for col_name, dtype in data.dtypes.items()
    ]
    insert_sql = (
        "insert into t_file_save_field_info (file_deposit,field_name,field_type,cre_time,cre_person) values "
        + ",".join(value_rows)
    )
    return list_error_file, update_sql, insert_sql
if __name__ == '__main__':
    # Argument layout produced by round_robin_mysql.sh:
    #   [1] file_deposit_id  [2] file_name  [3] file_address  [4] file_format
    #   [5] HDFS output path [6] unzip directory [7] copy directory
    get_file_deposit_info = sys.argv
    print(get_file_deposit_info)
    if len(get_file_deposit_info) < 8:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("usage: file_pretreatment.py <file_id> <file_name> <file_address> "
                 "<file_format> <hdfs_path> <unzip_dir> <copy_dir>")

    get_file_id = get_file_deposit_info[1]
    get_ftp_path = get_file_deposit_info[3]
    # Read for completeness; nothing below uses it.
    get_output_hdfs_path = get_file_deposit_info[5]
    get_unzip_output_path = get_file_deposit_info[6]
    get_copy_output_path = get_file_deposit_info[7]

    unzip_all(get_ftp_path, get_unzip_output_path)
    su_code = copy_file(get_unzip_output_path, get_copy_output_path)
    if su_code == 1:
        # Recreate /user/tuomin_test on HDFS; exit codes are ignored, as before.
        os.system("hdfs dfs -rm -r /user/tuomin_test")
        os.system("hdfs dfs -mkdir /user/tuomin_test")
        pre_list = analysis_csv(get_file_id, get_copy_output_path)
        print("csv无法读取的文件list:" + str(pre_list[0]))
        print("十条json数据更新sql:" + str(pre_list[1]))
        print("数据字段写入sql:" + str(pre_list[2]))

        # Keyword arguments are required by PyMySQL 1.0+.
        # NOTE(review): host and credentials are hard-coded here.
        db = pymysql.connect(
            host="192.168.1.90",
            user="root",
            password="root",
            database="demo_test",
        )
        try:
            cursor = db.cursor()
            cursor.execute(pre_list[1])
            cursor.execute(pre_list[2])
            db.commit()
        finally:
            # Release the connection even when one of the statements fails.
            db.close()
This diff is collapsed.
Click to expand it.
wanjia_tuomin/micko/round_robin_mysql.sh
0 → 100644
View file @
c6b61ee2
#!/bin/bash
# Polls t_file_pretreatment_status for a file whose pretreatment_status is '01',
# runs file_pretreatment.py for it and uploads the copied files to HDFS.
set -e

# NOTE(review): connection settings and credentials are hard-coded.
hostname="192.168.1.90"
port="3306"
username="root"
password="root"
dbname="demo_test"

# Run one SQL statement ($1) and print the result rows.
# -N suppresses the column-name header so only data rows are captured.
run_sql() {
    mysql -h "${hostname}" -P "${port}" -u "${username}" -p"${password}" "${dbname}" -s -N -e "$1"
}

select_db_sql="select file_deposit from t_file_pretreatment_status where pretreatment_status='01'"
pending_ids=$(run_sql "${select_db_sql}")
echo "file_deposit执行结果返回:$pending_ids"

# Several pending rows come back one id per line. Only the first id is handled
# per run; pasting all of them into the follow-up query would produce invalid SQL.
file_deposit=${pending_ids%%$'\n'*}

if [ -n "$file_deposit" ]; then
    echo "轮循到新增文件,文件id为:$file_deposit"
    get_file_deposit_info_sql="select file_deposit_id,file_name,file_address,file_format from t_file_deposit where file_deposit_id=$file_deposit"
    get_file_deposit_info=$(run_sql "${get_file_deposit_info_sql}")
    # Second whitespace-separated field of the first row is the archive name.
    zip_file_name=$(echo "$get_file_deposit_info" | awk 'NR==1 {print $2}')
    # Everything before the first dot is used as the HDFS directory name.
    file_name=$(echo "$zip_file_name" | cut -d . -f 1)
    hdfs_path="/user/datawarehouse/$(date +%Y-%m-%d)/$file_name"
    unzip_output_path="/root/azkaban/dst"
    copy_output_path="/root/azkaban/dt"

    # $get_file_deposit_info is intentionally left unquoted: its four columns
    # must arrive as separate arguments, because file_pretreatment.py reads
    # them by position.
    # shellcheck disable=SC2086
    python3 file_pretreatment.py $get_file_deposit_info "$hdfs_path" "$unzip_output_path" "$copy_output_path"

    # The tests are used directly as conditions. Under "set -e" a failing
    # stand-alone "hdfs dfs -test" would abort the script before "$?" could be
    # inspected, so the "directory missing" branch would never run.
    if hdfs dfs -test -d /user/tuomin_test; then
        echo "=====1===="
        if hdfs dfs -test -d "$hdfs_path"; then
            echo "=====3===="
            # Target already exists: remove it so the upload starts clean.
            hdfs dfs -rm -r "$hdfs_path"
        else
            echo "=====2===="
        fi
        hdfs dfs -mkdir -p "$hdfs_path"
        hdfs dfs -put "$copy_output_path" "$hdfs_path"
        echo "=========4====="
    fi
else
    echo "未轮循到新增文件..."
fi
\ No newline at end of file
This diff is collapsed.
Click to expand it.
wanjia_tuomin/micko/start.job
0 → 100644
View file @
c6b61ee2
# First command-type job of the flow: declares no dependencies and only echoes a start marker.
type=command
command=echo "Scheduling began....."
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment