 ߷׻ؿ

 giant_lock ݻϵ̲ƤϤʤXXX ײ
I/O ԤʤäƤϤʤ
ͣ㳰ϡdbq Ԥε̲ϻʤ

ȴǡͥåȥgiant_lock ݻޤޤäƤ
Ȥ롣Ϥ nonblocking ⡼ɤǹԤʤäƤ(Ȧ?)ʤΤǡ
ƤХåե⾮С̲ʤȦ
/* XXX FIXME too long giant lock */ ȤȤǡ
ƤβսޡƤȦ

 ᥿ǡ򡢥˥å夹롣
ǽͥܥǥ󥰤ñˤΤŪ
ǡ gfmdưƥɤ߹ߡΤӤ dbq ͳ DB 
񤭽Фȡư DB ɤ߹ߤԤʤȤϤʤ

xattr 㳰
ͳϡ
xattr ȤƤʤ礭ʥǡꤷƤ
߷׼Ԥ¾ʬȤ
Ǥ뤿ᡣ
դˡȤǡƤ⡢xattr ؤΥå
Ƥ餺٤Ȥ»बäߤϡ桼ꤷ
xattr ϥåǤ롣ޤgfarm.ncopy Ͼ˥å夹롣

 mutex 

mutex ֤ΰ¸ط˥롼פäƤϤʤǥåɥå롣ס

Ʊʣ mutex γԤʤγϡ٤ʸ񲽤ɬס

mutex ϡserver/gfmd/README 
XXX ݼ餵Ƥʤ׹

giant_lock -> struct host::replication_mutex -> dbq.mutex
	cf. host_peer_set()

giant_lock -> dfc_allq.mutex
giant_lock -> busyq.mutex -> removeq.mutex -> struct host::back_channel_mutex
	ʲξʣˤ롣
	giant_lock -> removeq.mutex
	busyq.mutex -> removeq.mutex
	busyq.mutex -> struct host::back_channel_mutex (for host_is_up())

 giant_lock ݻޤޡ̲ɤå

  ʲΥåɤΤߡgiant_lock ݻޤ޵̲Ƥɤ
  ʳΥåɤ̲Τϡ¸طΥ롼פ뤿
  ػߡ

  sync_protocol_thread_pool °륹å

 leaf routine ΤߤǳƤ mutex

    mutex ݻޤޡ¾ mutex 뤳ȤϤʤ
   ƤϤʤ

  callout_module.mutex

 åɰ

åɴ֤ΰ¸ط˥롼פäƤϤʤǥåɥå롣ס

thrpool_add_job() 硢
thrpool_add_job() ƤӽФåɤϡ
thrpool_add_job() 륹åɥסˡ
¸Ƥޤ
ʤthrpool_add_job() 륹åɥס뤬դξ硢
thrpool_add_job() 륹åɤϵ̲롣

åɥס°륹åɤ
ȤΥåɥסФ thrpool_add_job() Τܡס
Υåɥסåɤ
ޤƱ˼åɥסФ thrpool_add_job() ȡ
ס˶ʤΤǡƵ̲Τޤ޿ˤʤ롣
äȤ̾٤ǤСgfarm_metadb_job_queue_length Ĺ
塼ˤäƴ˾פ뤿ᡢΥǥåɥåȯʤ٤
塼դλ˾嵭ξȯȤޤ

åɥס֤ΰ¸ط˥롼פΤܡס
back_channel_send_thread_pool ϡ̿򤷤ưŤ, 
back_channel_recv_thread_pool ˰¸ƤΤաס

¸طοޤ
doc/internal/png/gfmd-thread-dependency.png
ˤ뤬οޤǡ
(å)éäåɴ֤롼פˤʤäƤƤϤʤ
Ȣ(åɥס)éäס֤롼פˤʤäƤƤϤʤ
ƱȢ(ƱΥåɥס)ˤ륹åɴ֤ˡ
  thrpool_add_job()طäƤϤʤ

 main å: accepting_loop() 

   TCP ³ԤԤʤΤ䡣

 create_detached_thread() Ǻ롢Ωåɡ

ʥƻ sigs_handler()

callout ƻ callout_main()  CALLOUT_NTHREADS

    callout_main()  thrpool_add_job() 륹åɥסο
    callout_main() åɤѰդɤʤȡ륹å
    ס뤬դȤʤäȤطʤåɤνޤٱ䤷Ƥޤ
    ߤϡback_channel_send_thread_pool ΤߤǤ뤿ᡢCALLOUT_NTHREADS == 1

    callout_reset() ƤΤϡback_channel_recv_thread_pool °
    gfs_async_client_status_result() callout_reset() 
    cond_signal() ƤʤΤǡcallout_reset() Ǥϰ¸ϵʤ

  db_thread()

ͥåȥƻ peer_watcher()  1

    peer_watcher()  thrpool_add_job() 륹åɥסο
    peer_watcher() åɤѰդɤʤȡ륹å
    ס뤬դȤʤäȤطʤåɤνޤٱ䤷Ƥޤ
    ߡsync_protocol_thread_pool  back_channel_recv_thread_pool  2
    뤿 peer_watcher() åɤ 2ĤˤԤ peer_watcher() 
    Ƥʤʤᡢ1ĤΤޤޡ
	 peer_watcher()  2åɤư褦ѹͽꡣXXX

    XXX DEADLOCK
    ʲͳǡdeadlock δ롣

    peer_watcher() ƤӽФ 2ĤΥåɥס롢sync_protocol_thread_pool
     back_channel_recv_thread_pool ϡback_channel_send_thread_pool 
    褦ˡƱ̤٤˹ԤʤäƵͤޤ뿴ۤϤʤ
    sync_protocol_thread_pool  back_channel_send_thread_pool ˰¸
    Ƥ뤿ᡢback_channel_send_thread_pool ¦ǡpeer_watcher() 
     sync_protocol_thread_pool Ф thrpool_add_job() ͤޤ
    ǽϻĤ롣
    peer_watcher()  1åɤΤߤȡpeer_watcher() Τߤޤä
    ޤᡢback_channel_recv_thread_pool ⴬ź򤯤äơߤޤ롣
    η̡back_channel_recv_thread_pool ˰¸Ƥ
    back_channel_send_thread_pool ưʤʤꡢ¸طΥ롼פȤʤäơ
    deadlock 롣

  peer_closer()

  backend_protocolξ硢filesystem nodeФ remover()  

    ѻߤơback_channel_recv_thread_pool Ȥ褦ˤͽꡣ

 åɥס

סγƥåɤμΤϡthrpool_worker()

 authentication_thread_pool

    ΥסѤƵưΤϡʲΥåɡ

    try_auth()
      accepting_loop()  thrpool_add_job() 롣

 sync_protocol_thread_pool

    ΥסѤƵưΤϡʲΥåɡ

    protocol_main()
      - try_auth() peer_authorized() ͳǡthrpool_add_job() 롣
      - peer_watcher() thrpool_add_job() 롣

 back_channel_send_thread_pool
  
    ΥסѤƵưΤϡʲΥåɡ

    gfs_async_client_status_request()
      - protocol_main() gfm_server_switch_async_back_channel() ͳǡ
        thrpool_add_job() 롣
      - callout_main() thrpool_add_job() 롣
	callout ˵ꤷƤΤϡback_channel_recv_thread_pool °
	gfs_async_client_status_result() callout_reset() ȤäƤΤ
	¸ϵʤ

    gfs_async_client_replication_request_request()
      - protocol_main() async_back_channel_replication_request() ͳǡ
        thrpool_add_job() 롣

 back_channel_recv_thread_pool

    ΥסѤƵưΤϡʲΥåɡ

    async_back_channel_main()
      - protocol_main() gfm_server_switch_async_back_channel() ͳǡ
        thrpool_add_job() 롣
      - peer_watcher() thrpool_add_job() 롣
	줬ȯΤϡgfsd  async RPC request 뤤ϡ
	gfmd  async RPC request Ф gfsd  reply 硣
	Ԥ gfmdgfsd  async RPC request ϡback_channel_send_thread_pool
	ѤƹԤʤ롣Τᡢasync_back_channel_main() ν
	ͤޤȡ̤Ȥơback_channel_send_thread_pool ޤǤͤޤ
	ǽ롣
	Τᡢback_channel_recv_thread_pool °륹åɤϡ
	ʲΥס°񸻤ԤäƤϤʤס
	 * authentication_thread_pool
	 * sync_protocol_thread_pool
	 * back_channel_send_thread_pool
	⤷Ԥʤȡ¸ط롼פdeadlock δ롣
	嵭Υס thrpool_add_job() Ƥ⤤ʤ
	ޤ嵭Υס°Ԥ mutex ԤäƤ⤤ʤ
	äơgiant_lock() ԤʤäƤܡ
	嵭Υסȶ礹뤬켫ȤǤ cond_wait() 
	Ԥʤʤ꡼դ mutex ԤĤʤʤ
	ޤgfm_async_server_replication_result() ϡľܡʲ
	ԤʤäƤϤʤ
		̤νΤ host_replicated()
		̤ peer_sender_lock() ԤĤȤ롣
				 ¦ԤĤȤ롣XXX DEADLOCK

 եκˤĤ

ºݤ˥夫inode_remove()Ƥ֡ݤΤϡ
ʲξ郎٤Ωä

(1) nlink  0
(2) ե뤬ץ󤵤Ƥʤ
(3) gfmd Ƴ replication ¹Ǥʤ

ʤ

	if (inode->i_nlink == 0 && inode->u.c.state == NULL &&
	    (!inode_is_file(inode) || inode->u.c.s.f.rstate == NULL))
		inode_remove(inode);


 back_channel ˴ؤ߷

 back_channel ɬפʻ (callout ʤ) 
   (1) peer ˵°뤫
   (2) host ˵°뤫
   (3) back_channel ̿Τι¤Τ˵°뤫
  Ȥ褬롣
  back_channel ̿λƱͤȡ¸֤ gfmd ư
  Ǥ (1)  (2) ǡ¤ΤäƤޤƱѤΰ
  γݤɬפȤʤ (3) ץκ꤬ñˤʤ롣
  ޤʲΤ褦 peer host ¿
	(peer ο == 饤Ȥο 2  host ο)
  Ūˤ (2) ͭʤΤǡhost ˵°뤳Ȥˤ
   async ˤĤƤϡconn Ʊͤ̿ɬפʥǡǤꡢ
  back_channel ͥڤؤäƤ뤬
  ŤͥޤĤäƤ˻Ȥǽθ (1) Ȥ롣

host_receiver_lock() ɬפͳ
   peer_watch() ƤӽФ졢Ʊ peer ʤƱ쥹åɤư
  뤿ᡢpeer ֤μȯʤ
  host_receiver_lock() Ƥͳϡpeer_free_request() Ȥζ
  Τᡣ

host_sender_lock()/_trylock() ɬפͳ
  - host_receiver_lock() Ʊ peer_free_request() Ȥζ
  - ξƱ peer ǤʣåɤǶ礹ΤǤβ
  - ¦ͤޤäˡƱ peer Υåɤǡåɥס뤬
    ԤΤɤ host_sender_trylock() ǻ롣
    gfm_async_server_replication_result() Τߤ _trylock() ǤϤʤ
    _lock() ȤäƤΤǡʣΥåɥסͭƤޤ
    ǽ뤬Ǥ뤿ᡢ¼ŪʴϤʤ㤤

host_disconnect_request() θƤӽФˤĤ
  ʣ back_channel åɤ䡢ե饦ɤ
  gfm_server_switch_back_channel_common() Ʊ˸ƤӽФ
  ǽ롣
  äˡե饦ɤǢե饦ɤǺ³Ť back_channel
  ³ ȤʤȤޤ
  кȤơʲΤ褦ˤ롣
   -  peer Ϥ줬ۤʤäƤ顢ʤˤ⤷ʤ
     եɤƤӽФ peer Ȥ NULL Ϥ
     ξˤϡ̵ͭ򤤤蘆Ǥ롣
   -  host->peer == NULL ʤ顢ʤˤ⤷ʤ
  ޤhost_sender_unlock()  host_receiver_unlock() ˤϡ
  ǤƤǽ뤿ᡢ peer Ϥ줬ۤʤäƤ
  顢ʤˤ⤷ʤ
  back_channel.c Ǥϡå򤫤 host_get_peer() ƤȤ
		if (peer != NULL) /* to make the race condition harmless */
			host_disconnect_request(host, peer);
  ȤƤʬ뤬 host_get_peer() ʹߡեɤ
  host_peer_set() Ԥʤ줿кξ硢GFS_PROTO_STATUS 
  ֻʤˤؤ餺host_disconnect_request()  if 
  ˸ƤФʤǽ鷺ˤ뤬ϼ GFS_PROTO_STATUS 
  ߺѤǤ롣

ۥ down к
  - gfs_client_status_request() ǡ󤫤ޤʤä
    硢ʤ host_status_reply_is_waiting() ΩäƤ
    ˤϡͥǤ롣(down ֤Ȥʤ)
  - λ host_sender_trylock() ǽɤ
    ǧαƤˤϡhost_peer_busy() 
    ͥǤ

 եʣ

 ŪˤϡƱֹĥץꥫƱƤġ

㳰ϰʲ2ࡣ

- оݤȤơ񤭹Ѥ˥ץ󤵤Ƥץꥫ
	ϡץꥫ桢Ĥ¸ߤ롣
	gfmd Ūˤϡ
		(accmode_to_op(struct file_opening::flag) & GFS_W_OK) != 0
	Ȥ郎ΩäƤ
		struct file_opening::u.f.spool_host
	Υۥȡ
	ˡΥץꥫֹ夲Ʋ褹롣
- ץꥱΥե
	ץꥱϡžԴƤġ
	ξ硢FILE_COPY_IS_VALID(struct file_copy *) Ȥʤ롣

	FILE_COPY_IS_VALID(struct file_copy *) ȤʤΤϡʳˡ
	ƤϴΥץꥫʤ
	FILE_COPY_IS_BEING_REMOVED(struct file_copy *) Ȥʤץꥫ
	롣

 Ūˤϡ׵᤬Ф (dead_file_copy Ǥ) ΤϡΤ

㳰ϰʲ2

- ʤ餫ͳǥץꥱ󤬼Ԥơκ׵᤹
- gfrm -h Ρɡפʤ gfs_replica_remove_by_file() ׵ᤵ줿

2㳰ˤĤƤϡdead_file_copy ˲äơ
FILE_COPY_IS_BEING_REMOVED(struct file_copy *) 򿿤Ȥơ/ƻ뤹롣
dead_file_copy  load ˤϡΤ struct file_copy ɬפȤʤ롣

 եʣξ

 Ԥ gfmd Ƴץꥱγ

 

桼ʣ׵

եι
  ե뤬줿硢ޤʣäΡɤˡʣۤ롣

  Ūˤϡinode_remove_every_other_replicas() ǡto_be_replicated 
  򽸤ᡢ
	remove_replica_entity(, &deferred_cleanup)
	file_replicating_new()
	async_back_channel_replication_request()
  ȤƤȤ졣
  ǡdead_file_copy ϺƤ뤬ºݤ˺׵RPCϹԤʤäƤʤ
   dead_file_copy  deferred_cleanup ȤƼꡢ
  struct file_replicating ¸ʣλˡ׵Ԥʤ
  δ֡ dead_file_copy  dead_file_copy_mark_kept() ˤ
  ݸƤ롣

  ʤ (ʤ schedule_replication() )
  FILE_COPY_IS_BEING_REMOVED(copy) ʥΡɤ⹹̤褫
  뤬θϤʤƤɤ
  ʤʤ顢ۤȤƤΤϿΥץꥫǤ뤿ᡢ
  ץꥫǤ⡢¹ԤǤ뤿ᡣ
  ʲΤ褦ͳ顢θԤʤäƤ롣
  - ľʣ˼ԤƤС⼺Ԥǽ⤤Ȼפ
  - gfrm -h ΡɡפŪ˺줿ʤ顢ۤ٤ǤϤʤ

 ǡ¤

ʣ¹ϡ!FILE_COPY_IS_VALID() ȤʤäƤ롣
ʣλ inode_replicated() 顢inode_add_replica(inode, fr->dst, 1)
ƤӽФvalid = 1 ѹƤ롣

ʣԤ˽äˤϡinode_replicated() 顢
inode_remove_replica_gen_deferred(, &dfc) ƤӽФ
֤äƤ dfc  removal_pendingq_enqueue() Ϥƺ׵Ф
removal_pendingq_enqueue() ¦ǤϡʲνԤʤäƤ롣
ԤʣǿФΤǤС
	remove_replica_entity()  dead_file_copy 롣(DBˤ)
	valid == 0 Ǥꡢޤ file_copy  DB ˽񤫤ƤʤΤǡ
	DB Ф file_copy κϹԤʤʤ
	file_copy  free() 롣
	XXX FIXME λޤǤϡfile_copy  free() ԤʤäƤ
	ʤǤʤȡλˡƱֹǿʣ
	줿ꡢ뤤 truncate ξˤϡ񤭹ѥץꥫ
	򤵤ƤޤޤǤ롣ξ硢夫Ϥ׵
	ˤäơäƥե뤬äƤޤ
	ǿκ gfrm -h ʤ gfs_replica_remove_by_file()
	ʤ GFM_PROTO_REPLICA_REMOVE_BY_FILE  inode_remove_replica()
	ǤԤʤäƤ롣
	ϡʲǧ졢
	    https://sourceforge.net/apps/trac/gfarm/ticket/88
		#88 - race condition between a replication failure
		and another replication
	replication˴ؤƤϡr4657, r4659  dead_file_copy_remove() 
	Ƴ뤳Ȥˤ(Ψϰ)к줿
	r4955 ǡ񤭻ˤϡޤǥץꥫ¸ߤʤä
	ΡɤˤƱΥե륱ޤ줿ᡢ
	ˤʤä
	ޤϢ
	    https://sourceforge.net/apps/trac/gfarm/ticket/78
		#78 annoying pgsql error messages about deadfilecopy
	    https://sourceforge.net/apps/trac/gfarm/ticket/144
		#144 INode table in the PostgreSQL backend may not be
		correctly updated
ԤʣФΤǤС
	ñ remove_replica_entity()  dead_file_copy 롣(DBˤ)
	file_copy ˤĤƤ¸ߤʤΤǡԤʤʤ

⤷եιäˡŤˤĤʣ¹
СŤ struct file_copy Ϻdead_file_copy 
׵Ф inode_remove_every_other_replicas() 
	remove_replica_entity(, NULL)
ǹԤʤäƤ롣remove_replica_entity() Ϻǽ NULL ǤС
׵ȯԤ롣
XXX FIXME: race condition
ǡʣ gfsd ¦ǥ塼ίΤǸŤʣκ׵᤬
Ťʣʣ׵˼¹ԤǽŪˤϤꤦ롣
ξ硢ס˥ߤĤʤΤǡե򼺤
äϵʤ

 2.4.0 ꡼κˡͽȤƹͤƤ (ŤϤ˲)

replication_pendingq
replication_confirmingq
replication_finishedq
replication_busyq

requested	
newgen_waiting	ֹ湹Ԥ
pending		GFS_PROTO_REPLICATION_REQUESTԤ塼
in_flight	GFS_PROTO_REPLICATION_REQUEST׵
replicating	GFS_PROTO_REPLICATION_REQUEST
		GFM_PROTO_REPLICATION_RESULT׵μԤ
replicated	GFS_PROTO_REPLICATION_REQUEST̤ˤؤ餺
		GFM_PROTO_REPLICATION_RESULT׵  race condition
		GFS_PROTO_REPLICATION_REQUESTμԤ
confirming	GFS_PROTO_REPLICATION_REQUEST
		GFM_PROTO_REPLICATION_RESULT׵ξ
		GFM_PROTO_REPLICATION_RESULTԤ塼
finished	GFM_PROTO_REPLICATION_RESULT
		finalizing Ԥ塼
finalizing	λ
removal_waiting	ʣ˼ԤԴʥץꥫκԤ
		λޤǤϡcopy->valid == 0 ʾ֤ϰݻƤ롣
replication_waiting
		Ԥˡʥץꥱꤵ줿
		ξ硢ɬȥ饤롣
busy		back_channel busy֤Τᡢ
		GFS_PROTO_REPLICATION_REQUESTʤ
		GFM_PROTO_REPLICATION_RESULT
		ǽˤʤΤԤäƤ

XXX
 󥻥μ 
kill 硢ץȥƱΤǡ
gfsd connection cache  purge ɬס

XXX
 Ԥ硢ǽʤȥ饤

XXX 
 inode:rstate  on going replication ݻƤ
  inode ࡼ֤ȡinode  DB ä뤬ˤϻĤ롣
  ΤȤfile_copy  DB ꤫äƤʤ
  λgfmd ȡfile_copy  orphan ˤʤ롣

 եʣ˼Ԥ硢ۥȤԴʣϡ
  gfmd  dead_file_copy Ȥäƾäɤ
  Ȥ gfsd ʬǾäƤ顢եʣ̤𤹤
  Τɤ

race condition 򤱤̣Ǥϡgfsd äñ
ξ硢gfsd å夹ȡԴʥץꥫΥߤ
ĤäƤޤȤˤʤ롣
̣ǡgfmd äɤ

gfmd ϡõλޤǴƻ뤷δ֤ incomplete 
ץꥫ֤ݻ򤱤ɬפ롣
ʤξõλԤϡʣԤλԤʤɤ櫓ǤϤʤ
ñʤץꥫξ⡢incomplete ֤ΰݻɬס
ʤȡ¹ԤʣԤʤǽ롣

ʤߤ gfarm-2.3.0 Ǥ⡢gfmd äƤ

η郎
https://sourceforge.net/apps/trac/gfarm/ticket/88
https://dev.chubu.sra.co.jp/trac/gfarm/nttcom/ticket/5
https://xi.lab.sphere.ad.jp/trac/gfarm/ticket/62
ȤȤʤä

 ʣμ
	(a1) gfmd  gfsd ʣ׵Ф
	(a2) ʣ˼Ԥ
	(a3) gfmd ʣμΤä׵ (== dead_file_copy) 
	(a4) gfmd  gfsd ׵᤬ã졢Τä
 ⤦Ĥʣ
	(b1) Ʊե˴ؤƱʣؤʣ׵᤬Ф
	(b2) gfsd ʣ
Ȥ 2Ĥν
(a1)  (a2)  (a3)  (a4)  (b1)  (b2)
ȤǼ¹Ԥʤ
(a1)  (a2)  (a3)  (b1)  (b2)  (a4)
ȤǼ¹Ԥ롢ʣ줿Τä a4 ǾäƤޤΤǤޤ
кȤơ
	ʣ׵᤬ФƱץꥫФdead_file_copyä顢
	֤ kept/in_flight/finished/finalizing ʤС
		λޤԤ
	ʤ
		dead_file_copyä
Ȥ꤬ͤ롣λޤԤĤΤgfmd¤Ū
ݤʤΤǡȤꤢμȤơ
	֤ kept/in_flight/finished/finalizing ΥˤĤƤϡ
		ñ BUSY ֤
Ȥˡ⤢롣

 ջ:
  async_back_channel_replication_request() Ƥˤϡ
  dead_file_copy_remove() == GFARM_ERR_NO_ERROR ǧƤɬפ롣
  ʤȡ嵭 race condition ȯ롣

 XXX ̤ TO-DO

protocol_main() Ǽ resuming_thread() ǤԤˤʤ
  Τ뤬resume  callback Ǽ¸Ƥ뤿ᡢåɥס
  ԤΥǥåɥåˤϤʤʤ

