You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

iterator.go 17 kB

4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527
  1. // Copyright 2016 Google LLC
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package pubsub
  15. import (
  16. "context"
  17. "io"
  18. "sync"
  19. "time"
  20. vkit "cloud.google.com/go/pubsub/apiv1"
  21. "cloud.google.com/go/pubsub/internal/distribution"
  22. "github.com/golang/protobuf/proto"
  23. gax "github.com/googleapis/gax-go/v2"
  24. pb "google.golang.org/genproto/googleapis/pubsub/v1"
  25. "google.golang.org/grpc"
  26. "google.golang.org/grpc/codes"
  27. "google.golang.org/grpc/status"
  28. )
// gracePeriod is how far ahead of a message's ack deadline the next deadline
// extension (modack) is scheduled.
//
// Between message receipt and ack (that is, the time spent processing a
// message) the deadline is extended by way of modack. Extending it only at the
// moment the deadline expires would be too late, so the extension is sent a
// little ahead of time; gracePeriod is that amount of time ahead of the actual
// deadline.
const gracePeriod = 5 * time.Second
// messageIterator receives messages for one subscription, either over a
// streaming-pull stream or with the Pull RPC, and batches the acks, nacks and
// ack-deadline extensions for the messages it has handed out. The batched
// requests are sent by the sender goroutine.
type messageIterator struct {
	ctx     context.Context
	cancel  func() // the function that will cancel ctx; called in stop
	po      *pullOptions
	ps      *pullStream // streaming-pull stream; nil when po.synchronous is set
	subc    *vkit.SubscriberClient
	subName string // full name of the subscription

	kaTick     <-chan time.Time // keep-alive (deadline extensions)
	ackTicker  *time.Ticker     // message acks
	nackTicker *time.Ticker     // message nacks (created with the same period as ackTicker)
	pingTicker *time.Ticker     // sends to the stream to keep it open
	failed     chan struct{}    // closed on stream error
	drained    chan struct{}    // closed when stopped && no more pending messages
	wg         sync.WaitGroup   // tracks the sender goroutine

	// mu is held while the maps below and err are read or written.
	mu          sync.Mutex
	ackTimeDist *distribution.D // dist uses seconds

	// keepAliveDeadlines is a map of id to expiration time. This map is used in conjunction with
	// subscription.ReceiveSettings.MaxExtension to record the maximum amount of time (the
	// deadline, more specifically) we're willing to extend a message's ack deadline. As each
	// message arrives, we'll record now+MaxExtension in this table; whenever we have a chance
	// to update ack deadlines (via modack), we'll consult this table and only include IDs
	// that are not beyond their deadline.
	keepAliveDeadlines map[string]time.Time
	pendingAcks        map[string]bool // ack IDs waiting to be acknowledged
	pendingNacks       map[string]bool // ack IDs waiting to be nacked (modack with deadline 0)
	pendingModAcks     map[string]bool // ack IDs whose ack deadline is to be modified
	err                error           // error from stream failure
}
  62. // newMessageIterator starts and returns a new messageIterator.
  63. // subName is the full name of the subscription to pull messages from.
  64. // Stop must be called on the messageIterator when it is no longer needed.
  65. // The iterator always uses the background context for acking messages and extending message deadlines.
  66. func newMessageIterator(subc *vkit.SubscriberClient, subName string, po *pullOptions) *messageIterator {
  67. var ps *pullStream
  68. if !po.synchronous {
  69. ps = newPullStream(context.Background(), subc.StreamingPull, subName)
  70. }
  71. // The period will update each tick based on the distribution of acks. We'll start by arbitrarily sending
  72. // the first keepAlive halfway towards the minimum ack deadline.
  73. keepAlivePeriod := minAckDeadline / 2
  74. // Ack promptly so users don't lose work if client crashes.
  75. ackTicker := time.NewTicker(100 * time.Millisecond)
  76. nackTicker := time.NewTicker(100 * time.Millisecond)
  77. pingTicker := time.NewTicker(30 * time.Second)
  78. cctx, cancel := context.WithCancel(context.Background())
  79. it := &messageIterator{
  80. ctx: cctx,
  81. cancel: cancel,
  82. ps: ps,
  83. po: po,
  84. subc: subc,
  85. subName: subName,
  86. kaTick: time.After(keepAlivePeriod),
  87. ackTicker: ackTicker,
  88. nackTicker: nackTicker,
  89. pingTicker: pingTicker,
  90. failed: make(chan struct{}),
  91. drained: make(chan struct{}),
  92. ackTimeDist: distribution.New(int(maxAckDeadline/time.Second) + 1),
  93. keepAliveDeadlines: map[string]time.Time{},
  94. pendingAcks: map[string]bool{},
  95. pendingNacks: map[string]bool{},
  96. pendingModAcks: map[string]bool{},
  97. }
  98. it.wg.Add(1)
  99. go it.sender()
  100. return it
  101. }
  102. // Subscription.receive will call stop on its messageIterator when finished with it.
  103. // Stop will block until Done has been called on all Messages that have been
  104. // returned by Next, or until the context with which the messageIterator was created
  105. // is cancelled or exceeds its deadline.
  106. func (it *messageIterator) stop() {
  107. it.cancel()
  108. it.mu.Lock()
  109. it.checkDrained()
  110. it.mu.Unlock()
  111. it.wg.Wait()
  112. }
  113. // checkDrained closes the drained channel if the iterator has been stopped and all
  114. // pending messages have either been n/acked or expired.
  115. //
  116. // Called with the lock held.
  117. func (it *messageIterator) checkDrained() {
  118. select {
  119. case <-it.drained:
  120. return
  121. default:
  122. }
  123. select {
  124. case <-it.ctx.Done():
  125. if len(it.keepAliveDeadlines) == 0 {
  126. close(it.drained)
  127. }
  128. default:
  129. }
  130. }
  131. // Called when a message is acked/nacked.
  132. func (it *messageIterator) done(ackID string, ack bool, receiveTime time.Time) {
  133. it.ackTimeDist.Record(int(time.Since(receiveTime) / time.Second))
  134. it.mu.Lock()
  135. defer it.mu.Unlock()
  136. delete(it.keepAliveDeadlines, ackID)
  137. if ack {
  138. it.pendingAcks[ackID] = true
  139. } else {
  140. it.pendingNacks[ackID] = true
  141. }
  142. it.checkDrained()
  143. }
  144. // fail is called when a stream method returns a permanent error.
  145. // fail returns it.err. This may be err, or it may be the error
  146. // set by an earlier call to fail.
  147. func (it *messageIterator) fail(err error) error {
  148. it.mu.Lock()
  149. defer it.mu.Unlock()
  150. if it.err == nil {
  151. it.err = err
  152. close(it.failed)
  153. }
  154. return it.err
  155. }
  156. // receive makes a call to the stream's Recv method, or the Pull RPC, and returns
  157. // its messages.
  158. // maxToPull is the maximum number of messages for the Pull RPC.
  159. func (it *messageIterator) receive(maxToPull int32) ([]*Message, error) {
  160. it.mu.Lock()
  161. ierr := it.err
  162. it.mu.Unlock()
  163. if ierr != nil {
  164. return nil, ierr
  165. }
  166. // Stop retrieving messages if the iterator's Stop method was called.
  167. select {
  168. case <-it.ctx.Done():
  169. it.wg.Wait()
  170. return nil, io.EOF
  171. default:
  172. }
  173. var rmsgs []*pb.ReceivedMessage
  174. var err error
  175. if it.po.synchronous {
  176. rmsgs, err = it.pullMessages(maxToPull)
  177. } else {
  178. rmsgs, err = it.recvMessages()
  179. }
  180. // Any error here is fatal.
  181. if err != nil {
  182. return nil, it.fail(err)
  183. }
  184. msgs, err := convertMessages(rmsgs)
  185. if err != nil {
  186. return nil, it.fail(err)
  187. }
  188. // We received some messages. Remember them so we can keep them alive. Also,
  189. // do a receipt mod-ack when streaming.
  190. maxExt := time.Now().Add(it.po.maxExtension)
  191. ackIDs := map[string]bool{}
  192. it.mu.Lock()
  193. now := time.Now()
  194. for _, m := range msgs {
  195. m.receiveTime = now
  196. addRecv(m.ID, m.ackID, now)
  197. m.doneFunc = it.done
  198. it.keepAliveDeadlines[m.ackID] = maxExt
  199. // Don't change the mod-ack if the message is going to be nacked. This is
  200. // possible if there are retries.
  201. if !it.pendingNacks[m.ackID] {
  202. ackIDs[m.ackID] = true
  203. }
  204. }
  205. deadline := it.ackDeadline()
  206. it.mu.Unlock()
  207. if len(ackIDs) > 0 {
  208. if !it.sendModAck(ackIDs, deadline) {
  209. return nil, it.err
  210. }
  211. }
  212. return msgs, nil
  213. }
  214. // Get messages using the Pull RPC.
  215. // This may block indefinitely. It may also return zero messages, after some time waiting.
  216. func (it *messageIterator) pullMessages(maxToPull int32) ([]*pb.ReceivedMessage, error) {
  217. // Use it.ctx as the RPC context, so that if the iterator is stopped, the call
  218. // will return immediately.
  219. res, err := it.subc.Pull(it.ctx, &pb.PullRequest{
  220. Subscription: it.subName,
  221. MaxMessages: maxToPull,
  222. }, gax.WithGRPCOptions(grpc.MaxCallRecvMsgSize(maxSendRecvBytes)))
  223. switch {
  224. case err == context.Canceled:
  225. return nil, nil
  226. case err != nil:
  227. return nil, err
  228. default:
  229. return res.ReceivedMessages, nil
  230. }
  231. }
  232. func (it *messageIterator) recvMessages() ([]*pb.ReceivedMessage, error) {
  233. res, err := it.ps.Recv()
  234. if err != nil {
  235. return nil, err
  236. }
  237. return res.ReceivedMessages, nil
  238. }
// sender runs in a goroutine and handles all sends to the stream.
//
// Each loop iteration waits for one event (failure, drain, keep-alive tick,
// nack tick, ack tick or ping tick), decides under the lock which pending
// batches to flush, swaps those batches out, releases the lock and then
// performs the RPCs. The loop ends when the iterator has failed, when a send
// fails, or after the final flush that follows the drained signal. On exit it
// stops the tickers, closes the send side of the stream (if there is one) and
// marks the wait group done.
func (it *messageIterator) sender() {
	defer it.wg.Done()
	defer it.ackTicker.Stop()
	defer it.nackTicker.Stop()
	defer it.pingTicker.Stop()
	defer func() {
		// ps is nil in synchronous mode.
		if it.ps != nil {
			it.ps.CloseSend()
		}
	}()

	done := false
	for !done {
		sendAcks := false
		sendNacks := false
		sendModAcks := false
		sendPing := false

		// Deadline used both for this iteration's mod-acks and for scheduling
		// the next keep-alive tick.
		dl := it.ackDeadline()

		// Every case except failed acquires it.mu and leaves it locked; it is
		// released after the pending batches have been swapped out below.
		select {
		case <-it.failed:
			// Stream failed: nothing to do, so stop immediately.
			return

		case <-it.drained:
			// All outstanding messages have been marked done:
			// nothing left to do except make the final calls.
			it.mu.Lock()
			sendAcks = (len(it.pendingAcks) > 0)
			sendNacks = (len(it.pendingNacks) > 0)
			// No point in sending modacks.
			done = true

		case <-it.kaTick:
			it.mu.Lock()
			it.handleKeepAlives()
			sendModAcks = (len(it.pendingModAcks) > 0)

			// Schedule the next keep-alive gracePeriod ahead of the deadline.
			nextTick := dl - gracePeriod
			if nextTick <= 0 {
				// If the deadline is <= gracePeriod, let's tick again halfway to
				// the deadline.
				nextTick = dl / 2
			}
			it.kaTick = time.After(nextTick)

		case <-it.nackTicker.C:
			it.mu.Lock()
			sendNacks = (len(it.pendingNacks) > 0)

		case <-it.ackTicker.C:
			it.mu.Lock()
			sendAcks = (len(it.pendingAcks) > 0)

		case <-it.pingTicker.C:
			it.mu.Lock()
			// Ping only if we are processing messages via streaming.
			sendPing = !it.po.synchronous && (len(it.keepAliveDeadlines) > 0)
		}
		// Lock is held here.
		// Take ownership of the batches to send and install fresh empty maps,
		// so the RPCs below can run without holding the lock.
		var acks, nacks, modAcks map[string]bool
		if sendAcks {
			acks = it.pendingAcks
			it.pendingAcks = map[string]bool{}
		}
		if sendNacks {
			nacks = it.pendingNacks
			it.pendingNacks = map[string]bool{}
		}
		if sendModAcks {
			modAcks = it.pendingModAcks
			it.pendingModAcks = map[string]bool{}
		}
		it.mu.Unlock()
		// Make Ack and ModAck RPCs.
		// A false result means the send failed and the iterator has been
		// marked as failed, so the goroutine exits.
		if sendAcks {
			if !it.sendAck(acks) {
				return
			}
		}
		if sendNacks {
			// Nack indicated by modifying the deadline to zero.
			if !it.sendModAck(nacks, 0) {
				return
			}
		}
		if sendModAcks {
			if !it.sendModAck(modAcks, dl) {
				return
			}
		}
		if sendPing {
			it.pingStream()
		}
	}
}
  328. // handleKeepAlives modifies the pending request to include deadline extensions
  329. // for live messages. It also purges expired messages.
  330. //
  331. // Called with the lock held.
  332. func (it *messageIterator) handleKeepAlives() {
  333. now := time.Now()
  334. for id, expiry := range it.keepAliveDeadlines {
  335. if expiry.Before(now) {
  336. // This delete will not result in skipping any map items, as implied by
  337. // the spec at https://golang.org/ref/spec#For_statements, "For
  338. // statements with range clause", note 3, and stated explicitly at
  339. // https://groups.google.com/forum/#!msg/golang-nuts/UciASUb03Js/pzSq5iVFAQAJ.
  340. delete(it.keepAliveDeadlines, id)
  341. } else {
  342. // This will not conflict with a nack, because nacking removes the ID from keepAliveDeadlines.
  343. it.pendingModAcks[id] = true
  344. }
  345. }
  346. it.checkDrained()
  347. }
  348. func (it *messageIterator) sendAck(m map[string]bool) bool {
  349. // Account for the Subscription field.
  350. overhead := calcFieldSizeString(it.subName)
  351. return it.sendAckIDRPC(m, maxPayload-overhead, func(ids []string) error {
  352. recordStat(it.ctx, AckCount, int64(len(ids)))
  353. addAcks(ids)
  354. // Use context.Background() as the call's context, not it.ctx. We don't
  355. // want to cancel this RPC when the iterator is stopped.
  356. return it.subc.Acknowledge(context.Background(), &pb.AcknowledgeRequest{
  357. Subscription: it.subName,
  358. AckIds: ids,
  359. })
  360. })
  361. }
  362. // The receipt mod-ack amount is derived from a percentile distribution based
  363. // on the time it takes to process messages. The percentile chosen is the 99%th
  364. // percentile in order to capture the highest amount of time necessary without
  365. // considering 1% outliers.
  366. func (it *messageIterator) sendModAck(m map[string]bool, deadline time.Duration) bool {
  367. deadlineSec := int32(deadline / time.Second)
  368. // Account for the Subscription and AckDeadlineSeconds fields.
  369. overhead := calcFieldSizeString(it.subName) + calcFieldSizeInt(int(deadlineSec))
  370. return it.sendAckIDRPC(m, maxPayload-overhead, func(ids []string) error {
  371. if deadline == 0 {
  372. recordStat(it.ctx, NackCount, int64(len(ids)))
  373. } else {
  374. recordStat(it.ctx, ModAckCount, int64(len(ids)))
  375. }
  376. addModAcks(ids, deadlineSec)
  377. // Retry this RPC on Unavailable for a short amount of time, then give up
  378. // without returning a fatal error. The utility of this RPC is by nature
  379. // transient (since the deadline is relative to the current time) and it
  380. // isn't crucial for correctness (since expired messages will just be
  381. // resent).
  382. cctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
  383. defer cancel()
  384. bo := gax.Backoff{
  385. Initial: 100 * time.Millisecond,
  386. Max: time.Second,
  387. Multiplier: 2,
  388. }
  389. for {
  390. err := it.subc.ModifyAckDeadline(cctx, &pb.ModifyAckDeadlineRequest{
  391. Subscription: it.subName,
  392. AckDeadlineSeconds: deadlineSec,
  393. AckIds: ids,
  394. })
  395. switch status.Code(err) {
  396. case codes.Unavailable:
  397. if err := gax.Sleep(cctx, bo.Pause()); err == nil {
  398. continue
  399. }
  400. // Treat sleep timeout like RPC timeout.
  401. fallthrough
  402. case codes.DeadlineExceeded:
  403. // Timeout. Not a fatal error, but note that it happened.
  404. recordStat(it.ctx, ModAckTimeoutCount, 1)
  405. return nil
  406. default:
  407. // Any other error is fatal.
  408. return err
  409. }
  410. }
  411. })
  412. }
  413. func (it *messageIterator) sendAckIDRPC(ackIDSet map[string]bool, maxSize int, call func([]string) error) bool {
  414. ackIDs := make([]string, 0, len(ackIDSet))
  415. for k := range ackIDSet {
  416. ackIDs = append(ackIDs, k)
  417. }
  418. var toSend []string
  419. for len(ackIDs) > 0 {
  420. toSend, ackIDs = splitRequestIDs(ackIDs, maxSize)
  421. if err := call(toSend); err != nil {
  422. // The underlying client handles retries, so any error is fatal to the
  423. // iterator.
  424. it.fail(err)
  425. return false
  426. }
  427. }
  428. return true
  429. }
  430. // Send a message to the stream to keep it open. The stream will close if there's no
  431. // traffic on it for a while. By keeping it open, we delay the start of the
  432. // expiration timer on messages that are buffered by gRPC or elsewhere in the
  433. // network. This matters if it takes a long time to process messages relative to the
  434. // default ack deadline, and if the messages are small enough so that many can fit
  435. // into the buffer.
  436. func (it *messageIterator) pingStream() {
  437. // Ignore error; if the stream is broken, this doesn't matter anyway.
  438. _ = it.ps.Send(&pb.StreamingPullRequest{})
  439. }
  440. // calcFieldSizeString returns the number of bytes string fields
  441. // will take up in an encoded proto message.
  442. func calcFieldSizeString(fields ...string) int {
  443. overhead := 0
  444. for _, field := range fields {
  445. overhead += 1 + len(field) + proto.SizeVarint(uint64(len(field)))
  446. }
  447. return overhead
  448. }
  449. // calcFieldSizeInt returns the number of bytes int fields
  450. // will take up in an encoded proto message.
  451. func calcFieldSizeInt(fields ...int) int {
  452. overhead := 0
  453. for _, field := range fields {
  454. overhead += 1 + proto.SizeVarint(uint64(field))
  455. }
  456. return overhead
  457. }
  458. // splitRequestIDs takes a slice of ackIDs and returns two slices such that the first
  459. // ackID slice can be used in a request where the payload does not exceed maxSize.
  460. func splitRequestIDs(ids []string, maxSize int) (prefix, remainder []string) {
  461. size := 0
  462. i := 0
  463. // TODO(hongalex): Use binary search to find split index, since ackIDs are
  464. // fairly constant.
  465. for size < maxSize && i < len(ids) {
  466. size += calcFieldSizeString(ids[i])
  467. i++
  468. }
  469. if size > maxSize {
  470. i--
  471. }
  472. return ids[:i], ids[i:]
  473. }
  474. // The deadline to ack is derived from a percentile distribution based
  475. // on the time it takes to process messages. The percentile chosen is the 99%th
  476. // percentile - that is, processing times up to the 99%th longest processing
  477. // times should be safe. The highest 1% may expire. This number was chosen
  478. // as a way to cover most users' usecases without losing the value of
  479. // expiration.
  480. func (it *messageIterator) ackDeadline() time.Duration {
  481. pt := time.Duration(it.ackTimeDist.Percentile(.99)) * time.Second
  482. if pt > maxAckDeadline {
  483. return maxAckDeadline
  484. }
  485. if pt < minAckDeadline {
  486. return minAckDeadline
  487. }
  488. return pt
  489. }